From 02bad7a858e763be49c257d957dfd70fab18af9e Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Thu, 29 Feb 2024 11:22:14 +0000
Subject: [PATCH 01/55] [AMDGPU] Simplify !if condition. NFC.

---
 llvm/lib/Target/AMDGPU/BUFInstructions.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 7d4d619788d392..4b74f3b81e5e78 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -2509,7 +2509,7 @@ class MUBUF_Real_Atomic_gfx12_impl<bits<8> op, string ps_name,
 
 multiclass MUBUF_Real_Atomic_gfx11_Renamed_impl<bits<8> op, bit is_return,
                                                 string real_name> {
-  defvar Rtn = !if(!eq(is_return, 1), "_RTN", "");
+  defvar Rtn = !if(is_return, "_RTN", "");
   def _BOTHEN#Rtn#_gfx11 :
     MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_BOTHEN" # Rtn, real_name>,
     AtomicNoRet<NAME # "_BOTHEN_gfx11", is_return>;
@@ -2526,7 +2526,7 @@ multiclass MUBUF_Real_Atomic_gfx11_Renamed_impl<bits<8> op, bit is_return,
 
 multiclass MUBUF_Real_Atomic_gfx12_Renamed_impl<bits<8> op, bit is_return,
                                                 string real_name> {
-  defvar Rtn = !if(!eq(is_return, 1), "_RTN", "");
+  defvar Rtn = !if(is_return, "_RTN", "");
   def _BOTHEN#Rtn#_gfx12 :
     MUBUF_Real_Atomic_gfx12_impl<op, NAME # "_VBUFFER_BOTHEN" # Rtn, real_name>,
     AtomicNoRet<NAME # "_BOTHEN_gfx12", is_return>;

From 72a60e770cee713a47876483798747cb7db58da6 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs@arm.com>
Date: Wed, 28 Feb 2024 15:04:49 +0000
Subject: [PATCH 02/55] [AArch64][NFC] Use regexes in register class tests

Some MIR and IR tests include checks for register class IDs, which are
unnecessary since the register class name is also checked for and that
doesn't change when new classes are added. This patch replaces the
hard-coded register class ID checks with regexes so they don't have to
be updated every time a new class is added.
---
 .../GlobalISel/irtranslator-inline-asm.ll     | 22 ++++++++--------
 .../irtranslator-unwind-inline-asm.ll         |  2 +-
 llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll |  6 ++---
 .../callbr-asm-outputs-indirect-isel.ll       | 26 +++++++++----------
 .../emit_fneg_with_non_register_operand.mir   |  4 +--
 .../CodeGen/AArch64/peephole-insvigpr.mir     |  2 +-
 6 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll
index ef8e4665364086..42f6570047fc7a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll
@@ -26,7 +26,7 @@ define void @asm_simple_register_clobber() {
 define i64 @asm_register_early_clobber() {
   ; CHECK-LABEL: name: asm_register_early_clobber
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   INLINEASM &"mov $0, 7; mov $1, 7", 1 /* sideeffect attdialect */, 2752523 /* regdef-ec:GPR64common */, def early-clobber %0, 2752523 /* regdef-ec:GPR64common */, def early-clobber %1, !0
+  ; CHECK-NEXT:   INLINEASM &"mov $0, 7; mov $1, 7", 1 /* sideeffect attdialect */, {{[0-9]+}} /* regdef-ec:GPR64common */, def early-clobber %0, {{[0-9]+}} /* regdef-ec:GPR64common */, def early-clobber %1, !0
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY %0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s64) = COPY %1
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]]
@@ -54,7 +54,7 @@ entry:
 define i32 @test_single_register_output() nounwind ssp {
   ; CHECK-LABEL: name: test_single_register_output
   ; CHECK: bb.1.entry:
-  ; CHECK-NEXT:   INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %0
+  ; CHECK-NEXT:   INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %0
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY %0
   ; CHECK-NEXT:   $w0 = COPY [[COPY]](s32)
   ; CHECK-NEXT:   RET_ReallyLR implicit $w0
@@ -66,7 +66,7 @@ entry:
 define i64 @test_single_register_output_s64() nounwind ssp {
   ; CHECK-LABEL: name: test_single_register_output_s64
   ; CHECK: bb.1.entry:
-  ; CHECK-NEXT:   INLINEASM &"mov $0, 7", 0 /* attdialect */, 2752522 /* regdef:GPR64common */, def %0
+  ; CHECK-NEXT:   INLINEASM &"mov $0, 7", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR64common */, def %0
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY %0
   ; CHECK-NEXT:   $x0 = COPY [[COPY]](s64)
   ; CHECK-NEXT:   RET_ReallyLR implicit $x0
@@ -79,7 +79,7 @@ entry:
 define float @test_multiple_register_outputs_same() #0 {
   ; CHECK-LABEL: name: test_multiple_register_outputs_same
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %0, 1703946 /* regdef:GPR32common */, def %1
+  ; CHECK-NEXT:   INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %0, {{[0-9]+}} /* regdef:GPR32common */, def %1
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY %0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
   ; CHECK-NEXT:   [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]]
@@ -96,7 +96,7 @@ define float @test_multiple_register_outputs_same() #0 {
 define double @test_multiple_register_outputs_mixed() #0 {
   ; CHECK-LABEL: name: test_multiple_register_outputs_mixed
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %0, 2555914 /* regdef:FPR64 */, def %1
+  ; CHECK-NEXT:   INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %0, {{[0-9]+}} /* regdef:FPR64 */, def %1
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY %0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s64) = COPY %1
   ; CHECK-NEXT:   $d0 = COPY [[COPY1]](s64)
@@ -125,7 +125,7 @@ define zeroext i8 @test_register_output_trunc(ptr %src) nounwind {
   ; CHECK-NEXT:   liveins: $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
-  ; CHECK-NEXT:   INLINEASM &"mov ${0:w}, 32", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %1
+  ; CHECK-NEXT:   INLINEASM &"mov ${0:w}, 32", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %1
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s8)
@@ -155,7 +155,7 @@ define void @test_input_register_imm() {
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 42
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64common = COPY [[C]](s64)
-  ; CHECK-NEXT:   INLINEASM &"mov x0, $0", 1 /* sideeffect attdialect */, 2752521 /* reguse:GPR64common */, [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"mov x0, $0", 1 /* sideeffect attdialect */, {{[0-9]+}} /* reguse:GPR64common */, [[COPY]]
   ; CHECK-NEXT:   RET_ReallyLR
   call void asm sideeffect "mov x0, $0", "r"(i64 42)
   ret void
@@ -190,7 +190,7 @@ define zeroext i8 @test_input_register(ptr %src) nounwind {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64common = COPY [[COPY]](p0)
-  ; CHECK-NEXT:   INLINEASM &"ldtrb ${0:w}, [$1]", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %1, 2752521 /* reguse:GPR64common */, [[COPY1]]
+  ; CHECK-NEXT:   INLINEASM &"ldtrb ${0:w}, [$1]", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %1, {{[0-9]+}} /* reguse:GPR64common */, [[COPY1]]
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY %1
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32)
   ; CHECK-NEXT:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s8)
@@ -207,7 +207,7 @@ define i32 @test_memory_constraint(ptr %a) nounwind {
   ; CHECK-NEXT:   liveins: $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
-  ; CHECK-NEXT:   INLINEASM &"ldr $0, $1", 8 /* mayload attdialect */, 1703946 /* regdef:GPR32common */, def %1, 262158 /* mem:m */, [[COPY]](p0)
+  ; CHECK-NEXT:   INLINEASM &"ldr $0, $1", 8 /* mayload attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %1, 262158 /* mem:m */, [[COPY]](p0)
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
   ; CHECK-NEXT:   $w0 = COPY [[COPY1]](s32)
   ; CHECK-NEXT:   RET_ReallyLR implicit $w0
@@ -221,7 +221,7 @@ define i16 @test_anyext_input() {
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s16)
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32common = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 1703946 /* regdef:GPR32common */, def %0, 1703945 /* reguse:GPR32common */, [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %0, {{[0-9]+}} /* reguse:GPR32common */, [[COPY]]
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY %0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
   ; CHECK-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
@@ -237,7 +237,7 @@ define i16 @test_anyext_input_with_matching_constraint() {
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s16)
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32common = COPY [[ANYEXT]](s32)
-  ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 1703946 /* regdef:GPR32common */, def %0, 2147483657 /* reguse tiedto:$0 */, [[COPY]](tied-def 3)
+  ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %0, 2147483657 /* reguse tiedto:$0 */, [[COPY]](tied-def 3)
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY %0
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
   ; CHECK-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-unwind-inline-asm.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-unwind-inline-asm.ll
index 59eb80ae6146b3..fbffb50bcbc8a3 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-unwind-inline-asm.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-unwind-inline-asm.ll
@@ -71,7 +71,7 @@ define void @test2() #0 personality ptr @__gcc_personality_v0 {
   ; CHECK-NEXT:   G_INVOKE_REGION_START
   ; CHECK-NEXT:   EH_LABEL <mcsymbol >
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64common = COPY [[DEF]](p0)
-  ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 2752521 /* reguse:GPR64common */, [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, {{[0-9]+}} /* reguse:GPR64common */, [[COPY]]
   ; CHECK-NEXT:   EH_LABEL <mcsymbol >
   ; CHECK-NEXT:   G_BR %bb.2
   ; CHECK-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll
index 8ed7059d2e754c..58299696e78fc2 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll
@@ -5,7 +5,7 @@ entry:
 ; CHECK:  %0:ppr = COPY $p0
 ; CHECK:  STR_PXI %0, %stack.0.predcnt.addr, 0 :: (store unknown-size into %ir.predcnt.addr, align 2)
 ; CHECK:  %1:pnr_p8to15 = COPY %0
-; CHECK:  INLINEASM &"ld1w {z0.s,z1.s,z2.s,z3.s}, $0/z, [x10]", 1 /* sideeffect attdialect */, 458761 /* reguse:PNR_p8to15 */, %1
+; CHECK:  INLINEASM &"ld1w {z0.s,z1.s,z2.s,z3.s}, $0/z, [x10]", 1 /* sideeffect attdialect */, {{[0-9]+}} /* reguse:PNR_p8to15 */, %1
 ; CHECK:  RET_ReallyLR
   %predcnt.addr = alloca target("aarch64.svcount"), align 2
   store target("aarch64.svcount") %predcnt, ptr %predcnt.addr, align 2
@@ -19,7 +19,7 @@ entry:
 ; CHECK:  %0:ppr = COPY $p0
 ; CHECK:  STR_PXI %0, %stack.0.predcnt.addr, 0 :: (store unknown-size into %ir.predcnt.addr, align 2)
 ; CHECK:  %1:pnr = COPY %0
-; CHECK:  INLINEASM &"ld1w {z0.s,z1.s,z2.s,z3.s}, $0/z, [x10]", 1 /* sideeffect attdialect */, 262153 /* reguse:PNR */, %1
+; CHECK:  INLINEASM &"ld1w {z0.s,z1.s,z2.s,z3.s}, $0/z, [x10]", 1 /* sideeffect attdialect */, {{[0-9]+}} /* reguse:PNR */, %1
 ; CHECK:  RET_ReallyLR
   %predcnt.addr = alloca target("aarch64.svcount"), align 2
   store target("aarch64.svcount") %predcnt, ptr %predcnt.addr, align 2
@@ -33,7 +33,7 @@ entry:
 ; CHECK:  %0:ppr = COPY $p0
 ; CHECK:  STR_PXI %0, %stack.0.predcnt.addr, 0 :: (store unknown-size into %ir.predcnt.addr, align 2)
 ; CHECK:  %1:pnr_3b = COPY %0
-; CHECK:  INLINEASM &"fadd z0.h, $0/m, z0.h, #0.5", 1 /* sideeffect attdialect */, 393225 /* reguse:PNR_3b */, %1
+; CHECK:  INLINEASM &"fadd z0.h, $0/m, z0.h, #0.5", 1 /* sideeffect attdialect */, {{[0-9]+}} /* reguse:PNR_3b */, %1
 ; CHECK:  RET_ReallyLR
   %predcnt.addr = alloca target("aarch64.svcount"), align 2
   store target("aarch64.svcount") %predcnt, ptr %predcnt.addr, align 2
diff --git a/llvm/test/CodeGen/AArch64/callbr-asm-outputs-indirect-isel.ll b/llvm/test/CodeGen/AArch64/callbr-asm-outputs-indirect-isel.ll
index 3b7b5dd3fa7a54..fbe89e70e4d8e2 100644
--- a/llvm/test/CodeGen/AArch64/callbr-asm-outputs-indirect-isel.ll
+++ b/llvm/test/CodeGen/AArch64/callbr-asm-outputs-indirect-isel.ll
@@ -18,7 +18,7 @@ define i32 @test0() {
   ; CHECK: bb.0.entry:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000), %bb.1(0x00000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   INLINEASM_BR &"# $0", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %5, 13 /* imm */, %bb.1
+  ; CHECK-NEXT:   INLINEASM_BR &"# $0", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %5, 13 /* imm */, %bb.1
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32all = COPY %5
   ; CHECK-NEXT:   B %bb.2
   ; CHECK-NEXT: {{  $}}
@@ -31,7 +31,7 @@ define i32 @test0() {
   ; CHECK-NEXT: bb.2.direct:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000), %bb.3(0x00000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   INLINEASM_BR &"# $0", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %7, 13 /* imm */, %bb.3
+  ; CHECK-NEXT:   INLINEASM_BR &"# $0", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %7, 13 /* imm */, %bb.3
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr32all = COPY %7
   ; CHECK-NEXT:   B %bb.4
   ; CHECK-NEXT: {{  $}}
@@ -107,7 +107,7 @@ define i32 @dont_split1() {
   ; CHECK: bb.0.entry:
   ; CHECK-NEXT:   successors: %bb.1(0x80000000), %bb.2(0x00000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %1, 13 /* imm */, %bb.2
+  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %1, 13 /* imm */, %bb.2
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32all = COPY %1
   ; CHECK-NEXT:   B %bb.1
   ; CHECK-NEXT: {{  $}}
@@ -168,7 +168,7 @@ define i32 @dont_split3() {
   ; CHECK: bb.0.entry:
   ; CHECK-NEXT:   successors: %bb.1(0x80000000), %bb.2(0x00000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %0, 13 /* imm */, %bb.2
+  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %0, 13 /* imm */, %bb.2
   ; CHECK-NEXT:   B %bb.1
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1.x:
@@ -194,7 +194,7 @@ define i32 @split_me0() {
   ; CHECK: bb.0.entry:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000), %bb.1(0x00000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1
+  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32all = COPY %3
   ; CHECK-NEXT:   B %bb.2
   ; CHECK-NEXT: {{  $}}
@@ -244,7 +244,7 @@ define i32 @split_me1(i1 %z) {
   ; CHECK-NEXT: bb.1.w:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000), %bb.2(0x00000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %5, 13 /* imm */, %bb.2, 13 /* imm */, %bb.2
+  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %5, 13 /* imm */, %bb.2, 13 /* imm */, %bb.2
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr32all = COPY %5
   ; CHECK-NEXT:   B %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -297,7 +297,7 @@ define i32 @split_me2(i1 %z) {
   ; CHECK-NEXT: bb.1.w:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000), %bb.2(0x00000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %6, 13 /* imm */, %bb.2, 13 /* imm */, %bb.2
+  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %6, 13 /* imm */, %bb.2, 13 /* imm */, %bb.2
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr32all = COPY %6
   ; CHECK-NEXT:   B %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -340,7 +340,7 @@ define i32 @dont_split4() {
   ; CHECK: bb.0.entry:
   ; CHECK-NEXT:   successors: %bb.1(0x80000000), %bb.2(0x00000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.2
+  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.2
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32all = COPY %3
   ; CHECK-NEXT:   B %bb.1
   ; CHECK-NEXT: {{  $}}
@@ -379,7 +379,7 @@ define i32 @dont_split5() {
   ; CHECK: bb.0.entry:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000), %bb.1(0x00000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1
+  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32all = COPY %3
   ; CHECK-NEXT:   B %bb.2
   ; CHECK-NEXT: {{  $}}
@@ -410,7 +410,7 @@ define i32 @split_me3() {
   ; CHECK: bb.0.entry:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000), %bb.1(0x00000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1
+  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32all = COPY %3
   ; CHECK-NEXT:   B %bb.2
   ; CHECK-NEXT: {{  $}}
@@ -456,7 +456,7 @@ define i32 @dont_split6(i32 %0) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr32all = PHI [[COPY]], %bb.0, %2, %bb.2
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr32common = COPY [[PHI]]
-  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %4, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3), 13 /* imm */, %bb.2
+  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %4, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3), 13 /* imm */, %bb.2
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr32all = COPY %4
   ; CHECK-NEXT:   B %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -491,7 +491,7 @@ define i32 @split_me4() {
   ; CHECK: bb.0.entry:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000), %bb.1(0x00000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1
+  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32all = COPY %3
   ; CHECK-NEXT:   B %bb.2
   ; CHECK-NEXT: {{  $}}
@@ -522,7 +522,7 @@ define i32 @split_me5() {
   ; CHECK: bb.0.entry:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000), %bb.1(0x00000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, 1703946 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1
+  ; CHECK-NEXT:   INLINEASM_BR &"", 0 /* attdialect */, {{[0-9]+}} /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32all = COPY %3
   ; CHECK-NEXT:   B %bb.2
   ; CHECK-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir
index 483dbd2f14d556..92fb053b0db726 100644
--- a/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir
+++ b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir
@@ -91,10 +91,10 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[LOADgot:%[0-9]+]]:gpr64common = LOADgot target-flags(aarch64-got) @c
   ; CHECK-NEXT:   [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c)
-  ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:WSeqPairsClass_with_sube32_in_MatrixIndexGPR32_12_15 */, def %2, 2147483657 /* reguse tiedto:$0 */, [[LDRDui]](tied-def 3)
+  ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, {{[0-9]+}} /* regdef:WSeqPairsClass_with_sube32_in_MatrixIndexGPR32_12_15 */, def %2, 2147483657 /* reguse tiedto:$0 */, [[LDRDui]](tied-def 3)
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:fpr64 = COPY %2
   ; CHECK-NEXT:   [[LDRDui1:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c)
-  ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:WSeqPairsClass_with_sube32_in_MatrixIndexGPR32_12_15 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[LDRDui1]](tied-def 3)
+  ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, {{[0-9]+}} /* regdef:WSeqPairsClass_with_sube32_in_MatrixIndexGPR32_12_15 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[LDRDui1]](tied-def 3)
   ; CHECK-NEXT:   [[FNEGDr:%[0-9]+]]:fpr64 = FNEGDr %2
   ; CHECK-NEXT:   nofpexcept FCMPDrr %4, killed [[FNEGDr]], implicit-def $nzcv, implicit $fpcr
   ; CHECK-NEXT:   Bcc 1, %bb.2, implicit $nzcv
diff --git a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
index 041b2dc6af1277..65148344096cd7 100644
--- a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
+++ b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
@@ -487,7 +487,7 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
     ; CHECK-NEXT: [[DEF:%[0-9]+]]:gpr64all = IMPLICIT_DEF
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[DEF]]
-    ; CHECK-NEXT: INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, 2359306 /* regdef:WSeqPairsClass_with_sube32_in_MatrixIndexGPR32_12_15 */, def %1, 262158 /* mem:m */, killed [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, {{[0-9]+}} /* regdef:WSeqPairsClass_with_sube32_in_MatrixIndexGPR32_12_15 */, def %1, 262158 /* mem:m */, killed [[COPY1]]
     ; CHECK-NEXT: [[MOVIv2d_ns:%[0-9]+]]:fpr128 = MOVIv2d_ns 0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[MOVIv2d_ns]].dsub
     ; CHECK-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF

From 31295bbe83c3ea9d8a3372efe34342a299d1018a Mon Sep 17 00:00:00 2001
From: Mitch Phillips <mitchp@google.com>
Date: Thu, 29 Feb 2024 11:34:37 +0100
Subject: [PATCH 03/55] Revert "AMDGPU: Add scheduling test for gfx940
 (#83220)"

This reverts commit f0484e08bdcf64106592808e3ca80404937b4657.

Reason: Broke the sanitizer build bots. See the github comments on
https://github.com/llvm/llvm-project/commit/f0484e08bdcf64106592808e3ca80404937b4657
for more information
---
 llvm/test/tools/llvm-mca/AMDGPU/gfx940.s | 189 -----------------------
 1 file changed, 189 deletions(-)
 delete mode 100644 llvm/test/tools/llvm-mca/AMDGPU/gfx940.s

diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx940.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx940.s
deleted file mode 100644
index c66d0b44b5e9a6..00000000000000
--- a/llvm/test/tools/llvm-mca/AMDGPU/gfx940.s
+++ /dev/null
@@ -1,189 +0,0 @@
-# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx940 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
-
-# CHECK: Iterations:        1
-# CHECK: Instructions:      71
-# CHECK: Total Cycles:      562
-# CHECK: Total uOps:        77
-
-# CHECK: Resources:
-# CHECK: [0]   - HWBranch
-# CHECK: [1]   - HWExport
-# CHECK: [2]   - HWLGKM
-# CHECK: [3]   - HWSALU
-# CHECK: [4]   - HWVALU
-# CHECK: [5]   - HWVMEM
-# CHECK: [6]   - HWXDL
-
-v_pk_fma_f32 v[0:1], v[0:1], v[0:1], v[0:1]
-v_pk_mov_b32 v[0:1], v[2:3], v[4:5]
-v_pk_add_f32 v[0:1], v[0:1], v[0:1]
-v_pk_mul_f32 v[0:1], v[0:1], v[0:1]
-v_add_co_u32 v5, s[0:1], v1, v2
-v_sub_co_u32 v5, s[0:1], v1, v2
-v_subrev_co_u32 v5, s[0:1], v1, v2
-v_addc_co_u32 v5, s[0:1], v1, v2, s[2:3]
-v_subb_co_u32 v5, s[0:1], v1, v2, s[2:3]
-v_subbrev_co_u32 v5, s[0:1], v1, v2, s[2:3]
-v_add_u32 v5, v1, v2
-v_sub_u32 v5, v1, v2
-v_subrev_u32 v5, v1, v2
-
-v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5]
-v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5]
-
-v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33]
-v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33]
-
-v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
-v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-
-v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
-v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
-
-v_mfma_f32_16x16x16_f16 v[0:3], v[4:5], v[6:7], v[0:3]
-v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[6:7], a[0:3]
-
-v_mfma_f32_32x32x8_f16 v[0:15], v[4:5], v[6:7], v[0:15]
-v_mfma_f32_32x32x8_f16 a[0:15], v[4:5], v[6:7], a[0:15]
-
-v_mfma_f32_16x16x16_bf16 v[0:3], v[4:5], v[6:7], v[0:3]
-v_mfma_f32_16x16x16_bf16 a[0:3], v[4:5], v[6:7], a[0:3]
-
-v_mfma_f32_32x32x8_bf16 v[0:15], v[4:5], v[6:7], v[0:15]
-v_mfma_f32_32x32x8_bf16 a[0:15], v[4:5], v[6:7], a[0:15]
-
-v_mfma_i32_16x16x32_i8 v[0:3], v[4:5], v[6:7], v[0:3]
-v_mfma_i32_16x16x32_i8 a[0:3], v[4:5], v[6:7], a[0:3]
-
-v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15]
-v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15]
-
-v_mfma_f32_4x4x4_16b_f16 v[0:3], v[0:1], v[2:3], v[2:5]
-v_mfma_f32_4x4x4_16b_f16 a[0:3], v[0:1], v[2:3], a[2:5]
-
-v_mfma_f32_16x16x4_4b_f16 v[0:15], v[2:3], v[4:5], v[18:33]
-v_mfma_f32_16x16x4_4b_f16 a[0:15], v[2:3], v[4:5], a[18:33]
-
-v_mfma_f32_32x32x4_2b_f16 v[0:31], v[0:1], v[2:3], v[34:65]
-v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[2:3], a[34:65]
-
-v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[0:1], v[2:3], v[2:5]
-v_mfma_f32_4x4x4_16b_bf16 a[0:3], v[0:1], v[2:3], a[2:5]
-
-v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[2:3], v[4:5], v[18:33]
-v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[4:5], a[18:33]
-
-v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[0:1], v[2:3], v[34:65]
-v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[0:1], v[2:3], a[34:65]
-
-v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5]
-v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[2:5]
-
-v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, v[18:33]
-v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[18:33]
-
-v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5]
-v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5]
-
-v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, v[34:65] blgp:7
-v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[34:65] blgp:7
-
-v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33]
-v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33]
-
-v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v1, v[2:5]
-v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v1, a[2:5]
-
-v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, v[18:33]
-v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, a[18:33]
-
-v_mfma_i32_32x32x4_2b_i8 v[0:31], v0, v1, v[34:65]
-v_mfma_i32_32x32x4_2b_i8 a[0:31], v0, v1, a[34:65]
-
-v_smfmac_f32_16x16x32_f16 v[10:13], a[2:3], v[4:7], v0 cbsz:3 abid:1
-v_smfmac_f32_16x16x32_f16 a[10:13], v[2:3], a[4:7], v1
-
-v_smfmac_f32_32x32x16_f16 v[10:25], a[2:3], v[4:7], v2 cbsz:3 abid:1
-v_smfmac_f32_32x32x16_f16 a[10:25], v[2:3], a[4:7], v3
-
-v_smfmac_f32_16x16x32_bf16 v[10:13], a[2:3], v[4:7], v4 cbsz:3 abid:1
-v_smfmac_f32_16x16x32_bf16 a[10:13], v[2:3], a[4:7], v5
-
-v_smfmac_i32_16x16x64_i8 v[10:13], a[2:3], v[4:7], v8 cbsz:3 abid:1
-v_smfmac_i32_16x16x64_i8 a[10:13], v[2:3], a[4:7], v9
-
-v_smfmac_i32_32x32x32_i8 v[10:25], a[2:3], v[4:7], v10 cbsz:3 abid:1
-v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v11
-
-# CHECK:     [0]    [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_pk_fma_f32 v[0:1], v[0:1], v[0:1], v[0:1]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_pk_mov_b32 v[0:1], v[2:3], v[4:5]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_pk_add_f32 v[0:1], v[0:1], v[0:1]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_pk_mul_f32 v[0:1], v[0:1], v[0:1]
-# CHECK-NEXT: -      -      -     1.00   1.00    -      -     v_add_co_u32_e64 v5, s[0:1], v1, v2
-# CHECK-NEXT: -      -      -     1.00   1.00    -      -     v_sub_co_u32_e64 v5, s[0:1], v1, v2
-# CHECK-NEXT: -      -      -     1.00   1.00    -      -     v_subrev_co_u32_e64 v5, s[0:1], v1, v2
-# CHECK-NEXT: -      -      -     1.00   1.00    -      -     v_addc_co_u32_e64 v5, s[0:1], v1, v2, s[2:3]
-# CHECK-NEXT: -      -      -     1.00   1.00    -      -     v_subb_co_u32_e64 v5, s[0:1], v1, v2, s[2:3]
-# CHECK-NEXT: -      -      -     1.00   1.00    -      -     v_subbrev_co_u32_e64 v5, s[0:1], v1, v2, s[2:3]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_add_u32_e32 v5, v1, v2
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_sub_u32_e32 v5, v1, v2
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_subrev_u32_e32 v5, v1, v2
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5]
-# CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33]
-# CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
-# CHECK-NEXT: -      -      -      -     1.00    -      -     v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
-# CHECK-NEXT: -      -      -      -      -      -     4.00   v_mfma_f32_16x16x16_f16 v[0:3], v[4:5], v[6:7], v[0:3]
-# CHECK-NEXT: -      -      -      -      -      -     4.00   v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[6:7], a[0:3]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_32x32x8_f16 v[0:15], v[4:5], v[6:7], v[0:15]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_32x32x8_f16 a[0:15], v[4:5], v[6:7], a[0:15]
-# CHECK-NEXT: -      -      -      -      -      -     4.00   v_mfma_f32_16x16x16_bf16 v[0:3], v[4:5], v[6:7], v[0:3]
-# CHECK-NEXT: -      -      -      -      -      -     4.00   v_mfma_f32_16x16x16_bf16 a[0:3], v[4:5], v[6:7], a[0:3]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_32x32x8_bf16 v[0:15], v[4:5], v[6:7], v[0:15]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_32x32x8_bf16 a[0:15], v[4:5], v[6:7], a[0:15]
-# CHECK-NEXT: -      -      -      -      -      -     4.00   v_mfma_i32_16x16x32_i8 v[0:3], v[4:5], v[6:7], v[0:3]
-# CHECK-NEXT: -      -      -      -      -      -     4.00   v_mfma_i32_16x16x32_i8 a[0:3], v[4:5], v[6:7], a[0:3]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15]
-# CHECK-NEXT: -      -      -      -      -      -     2.00   v_mfma_f32_4x4x4_16b_f16 v[0:3], v[0:1], v[2:3], v[2:5]
-# CHECK-NEXT: -      -      -      -      -      -     2.00   v_mfma_f32_4x4x4_16b_f16 a[0:3], v[0:1], v[2:3], a[2:5]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_16x16x4_4b_f16 v[0:15], v[2:3], v[4:5], v[18:33]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_16x16x4_4b_f16 a[0:15], v[2:3], v[4:5], a[18:33]
-# CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_f32_32x32x4_2b_f16 v[0:31], v[0:1], v[2:3], v[34:65]
-# CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[2:3], a[34:65]
-# CHECK-NEXT: -      -      -      -      -      -     2.00   v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[0:1], v[2:3], v[2:5]
-# CHECK-NEXT: -      -      -      -      -      -     2.00   v_mfma_f32_4x4x4_16b_bf16 a[0:3], v[0:1], v[2:3], a[2:5]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[2:3], v[4:5], v[18:33]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[4:5], a[18:33]
-# CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[0:1], v[2:3], v[34:65]
-# CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[0:1], v[2:3], a[34:65]
-# CHECK-NEXT: -      -      -      -      -      -     2.00   v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5]
-# CHECK-NEXT: -      -      -      -      -      -     2.00   v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[2:5]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, v[18:33]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[18:33]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5]
-# CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, v[34:65] blgp:7
-# CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[34:65] blgp:7
-# CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33]
-# CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33]
-# CHECK-NEXT: -      -      -      -      -      -     2.00   v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v1, v[2:5]
-# CHECK-NEXT: -      -      -      -      -      -     2.00   v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v1, a[2:5]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, v[18:33]
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, a[18:33]
-# CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_i32_32x32x4_2b_i8 v[0:31], v0, v1, v[34:65]
-# CHECK-NEXT: -      -      -      -      -      -     16.00  v_mfma_i32_32x32x4_2b_i8 a[0:31], v0, v1, a[34:65]
-# CHECK-NEXT: -      -      -      -      -      -     4.00   v_smfmac_f32_16x16x32_f16 v[10:13], a[2:3], v[4:7], v0 cbsz:3 abid:1
-# CHECK-NEXT: -      -      -      -      -      -     4.00   v_smfmac_f32_16x16x32_f16 a[10:13], v[2:3], a[4:7], v1
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_smfmac_f32_32x32x16_f16 v[10:25], a[2:3], v[4:7], v2 cbsz:3 abid:1
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_smfmac_f32_32x32x16_f16 a[10:25], v[2:3], a[4:7], v3
-# CHECK-NEXT: -      -      -      -      -      -     4.00   v_smfmac_f32_16x16x32_bf16 v[10:13], a[2:3], v[4:7], v4 cbsz:3 abid:1
-# CHECK-NEXT: -      -      -      -      -      -     4.00   v_smfmac_f32_16x16x32_bf16 a[10:13], v[2:3], a[4:7], v5
-# CHECK-NEXT: -      -      -      -      -      -     4.00   v_smfmac_i32_16x16x64_i8 v[10:13], a[2:3], v[4:7], v8 cbsz:3 abid:1
-# CHECK-NEXT: -      -      -      -      -      -     4.00   v_smfmac_i32_16x16x64_i8 a[10:13], v[2:3], a[4:7], v9
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_smfmac_i32_32x32x32_i8 v[10:25], a[2:3], v[4:7], v10 cbsz:3 abid:1
-# CHECK-NEXT: -      -      -      -      -      -     8.00   v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v11

From 20fe83bc852e15b2928a168635972a5d29689a49 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Thu, 29 Feb 2024 12:02:06 +0000
Subject: [PATCH 04/55] [AMDGPU] Add new aliases ds_subrev_rtn_u32/u64 for
 ds_rsub_rtn_u32/u64 (#83408)

Following on from #83118, this adds aliases for the "rtn" forms of these
instructions. The fact that they were missing from SP3 was an oversight
which has been fixed now.
---
 llvm/lib/Target/AMDGPU/DSInstructions.td |  2 ++
 llvm/test/MC/AMDGPU/gfx11_unsupported.s  | 12 ++++++++++++
 llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s |  6 ++++++
 3 files changed, 20 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index e6d27c2e64690d..7d79b9bba243cf 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1262,7 +1262,9 @@ defm DS_PK_ADD_RTN_BF16   : DS_Real_gfx12<0x0ab>;
 
 // New aliases added in GFX12 without renaming the instructions.
 def : MnemonicAlias<"ds_subrev_u32", "ds_rsub_u32">, Requires<[isGFX12Plus]>;
+def : MnemonicAlias<"ds_subrev_rtn_u32", "ds_rsub_rtn_u32">, Requires<[isGFX12Plus]>;
 def : MnemonicAlias<"ds_subrev_u64", "ds_rsub_u64">, Requires<[isGFX12Plus]>;
+def : MnemonicAlias<"ds_subrev_rtn_u64", "ds_rsub_rtn_u64">, Requires<[isGFX12Plus]>;
 
 //===----------------------------------------------------------------------===//
 // GFX11.
diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s
index bfca71ae3a01ef..f447263c30223d 100644
--- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s
@@ -2052,3 +2052,15 @@ global_atomic_cond_sub_u32 v0, v2, s[0:1] offset:64
 
 global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:64
 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_subrev_u32 v1, v2
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_subrev_rtn_u32 v5, v1, v2
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_subrev_u64 v1, v[2:3]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_subrev_rtn_u64 v[5:6], v1, v[2:3]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s
index aa063c8800aa41..057e99330bcaef 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s
@@ -27,5 +27,11 @@ ds_min_rtn_f64 v[5:6], v1, v[2:3]
 ds_subrev_u32 v1, v2
 // GFX12: ds_rsub_u32 v1, v2                      ; encoding: [0x00,0x00,0x08,0xd8,0x01,0x02,0x00,0x00]
 
+ds_subrev_rtn_u32 v5, v1, v2
+// GFX12: ds_rsub_rtn_u32 v5, v1, v2              ; encoding: [0x00,0x00,0x88,0xd8,0x01,0x02,0x00,0x05]
+
 ds_subrev_u64 v1, v[2:3]
 // GFX12: ds_rsub_u64 v1, v[2:3]                  ; encoding: [0x00,0x00,0x08,0xd9,0x01,0x02,0x00,0x00]
+
+ds_subrev_rtn_u64 v[5:6], v1, v[2:3]
+// GFX12: ds_rsub_rtn_u64 v[5:6], v1, v[2:3]      ; encoding: [0x00,0x00,0x88,0xd9,0x01,0x02,0x00,0x05]

From c57002db6afcd2474247d066fae71f696ce31947 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Thu, 29 Feb 2024 13:15:41 +0100
Subject: [PATCH 05/55] Verifier: More helpful error message for cross-function
 references (#82906)

This came up on Discourse.

See:
https://discourse.llvm.org/t/module-verification-failed-instruction-does-not-dominate-all-uses/77207/
---
 llvm/lib/IR/Verifier.cpp           |  4 +++-
 llvm/unittests/IR/VerifierTest.cpp | 28 ++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 3741e5deaa4cd1..e0de179e57146f 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5002,7 +5002,9 @@ void Verifier::visitInstruction(Instruction &I) {
     } else if (GlobalValue *GV = dyn_cast<GlobalValue>(I.getOperand(i))) {
       Check(GV->getParent() == &M, "Referencing global in another module!", &I,
             &M, GV, GV->getParent());
-    } else if (isa<Instruction>(I.getOperand(i))) {
+    } else if (Instruction *OpInst = dyn_cast<Instruction>(I.getOperand(i))) {
+      Check(OpInst->getFunction() == BB->getParent(),
+            "Referring to an instruction in another function!", &I);
       verifyDominatesUse(I, i);
     } else if (isa<InlineAsm>(I.getOperand(i))) {
       Check(CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i),
diff --git a/llvm/unittests/IR/VerifierTest.cpp b/llvm/unittests/IR/VerifierTest.cpp
index 31e3b9dfab4bfd..b2cd71e6a38568 100644
--- a/llvm/unittests/IR/VerifierTest.cpp
+++ b/llvm/unittests/IR/VerifierTest.cpp
@@ -339,5 +339,33 @@ TEST(VerifierTest, SwitchInst) {
   EXPECT_TRUE(verifyFunction(*F));
 }
 
+TEST(VerifierTest, CrossFunctionRef) {
+  LLVMContext C;
+  Module M("M", C);
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg=*/false);
+  Function *F1 = Function::Create(FTy, Function::ExternalLinkage, "foo1", M);
+  Function *F2 = Function::Create(FTy, Function::ExternalLinkage, "foo2", M);
+  BasicBlock *Entry1 = BasicBlock::Create(C, "entry", F1);
+  BasicBlock *Entry2 = BasicBlock::Create(C, "entry", F2);
+  Type *I32 = Type::getInt32Ty(C);
+
+  Value *Alloca = new AllocaInst(I32, 0, "alloca", Entry1);
+  ReturnInst::Create(C, Entry1);
+
+  Instruction *Store = new StoreInst(ConstantInt::get(I32, 0), Alloca, Entry2);
+  ReturnInst::Create(C, Entry2);
+
+  std::string Error;
+  raw_string_ostream ErrorOS(Error);
+  EXPECT_TRUE(verifyModule(M, &ErrorOS));
+  EXPECT_TRUE(
+      StringRef(ErrorOS.str())
+          .starts_with("Referring to an instruction in another function!"));
+
+  // Explicitly erase the store to avoid a use-after-free when the module is
+  // destroyed.
+  Store->eraseFromParent();
+}
+
 } // end anonymous namespace
 } // end namespace llvm

From b50b50bfbf55b883dcd290027cfd6ca5904b0495 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 29 Feb 2024 10:39:32 +0000
Subject: [PATCH 06/55] [X86] cmov-fp.ll - regenerate with common 'NOSSE'
 prefix to reduce duplication

---
 llvm/test/CodeGen/X86/cmov-fp.ll | 548 ++++++++++---------------------
 1 file changed, 178 insertions(+), 370 deletions(-)

diff --git a/llvm/test/CodeGen/X86/cmov-fp.ll b/llvm/test/CodeGen/X86/cmov-fp.ll
index 26e720ffcebccd..77665d083b7e3e 100644
--- a/llvm/test/CodeGen/X86/cmov-fp.ll
+++ b/llvm/test/CodeGen/X86/cmov-fp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=i686-- -mcpu pentium4 < %s | FileCheck %s -check-prefix=SSE
-; RUN: llc -mtriple=i686-- -mcpu pentium3 < %s | FileCheck %s -check-prefix=NOSSE2
-; RUN: llc -mtriple=i686-- -mcpu pentium2 < %s | FileCheck %s -check-prefix=NOSSE1
+; RUN: llc -mtriple=i686-- -mcpu pentium3 < %s | FileCheck %s -check-prefixes=NOSSE,NOSSE2
+; RUN: llc -mtriple=i686-- -mcpu pentium2 < %s | FileCheck %s -check-prefixes=NOSSE,NOSSE1
 ; RUN: llc -mtriple=i686-- -mcpu pentium < %s | FileCheck %s -check-prefix=NOCMOV
 ; PR14035
 
@@ -27,27 +27,16 @@ define double @test1(i32 %a, i32 %b, double %x) nounwind {
 ; SSE-NEXT:    popl %ebp
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test1:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovnbe %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test1:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovnbe %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test1:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovnbe %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test1:
 ; NOCMOV:       # %bb.0:
@@ -90,27 +79,16 @@ define double @test2(i32 %a, i32 %b, double %x) nounwind {
 ; SSE-NEXT:    popl %ebp
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test2:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovnb %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test2:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovnb %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test2:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovnb %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test2:
 ; NOCMOV:       # %bb.0:
@@ -153,27 +131,16 @@ define double @test3(i32 %a, i32 %b, double %x) nounwind {
 ; SSE-NEXT:    popl %ebp
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test3:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovb %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test3:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovb %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test3:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovb %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test3:
 ; NOCMOV:       # %bb.0:
@@ -216,27 +183,16 @@ define double @test4(i32 %a, i32 %b, double %x) nounwind {
 ; SSE-NEXT:    popl %ebp
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test4:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovbe %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test4:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovbe %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test4:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovbe %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test4:
 ; NOCMOV:       # %bb.0:
@@ -279,31 +235,18 @@ define double @test5(i32 %a, i32 %b, double %x) nounwind {
 ; SSE-NEXT:    popl %ebp
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test5:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    setg %al
-; NOSSE2-NEXT:    testb %al, %al
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovne %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test5:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    setg %al
-; NOSSE1-NEXT:    testb %al, %al
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovne %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test5:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    setg %al
+; NOSSE-NEXT:    testb %al, %al
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovne %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test5:
 ; NOCMOV:       # %bb.0:
@@ -346,31 +289,18 @@ define double @test6(i32 %a, i32 %b, double %x) nounwind {
 ; SSE-NEXT:    popl %ebp
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test6:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    setge %al
-; NOSSE2-NEXT:    testb %al, %al
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovne %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test6:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    setge %al
-; NOSSE1-NEXT:    testb %al, %al
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovne %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test6:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    setge %al
+; NOSSE-NEXT:    testb %al, %al
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovne %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test6:
 ; NOCMOV:       # %bb.0:
@@ -413,31 +343,18 @@ define double @test7(i32 %a, i32 %b, double %x) nounwind {
 ; SSE-NEXT:    popl %ebp
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test7:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    setl %al
-; NOSSE2-NEXT:    testb %al, %al
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovne %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test7:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    setl %al
-; NOSSE1-NEXT:    testb %al, %al
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovne %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test7:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    setl %al
+; NOSSE-NEXT:    testb %al, %al
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovne %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test7:
 ; NOCMOV:       # %bb.0:
@@ -480,31 +397,18 @@ define double @test8(i32 %a, i32 %b, double %x) nounwind {
 ; SSE-NEXT:    popl %ebp
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test8:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    setle %al
-; NOSSE2-NEXT:    testb %al, %al
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovne %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test8:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldl {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    setle %al
-; NOSSE1-NEXT:    testb %al, %al
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovne %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test8:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    setle %al
+; NOSSE-NEXT:    testb %al, %al
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovne %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test8:
 ; NOCMOV:       # %bb.0:
@@ -1065,27 +969,16 @@ define x86_fp80 @test17(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-NEXT:    fstp %st(1)
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test17:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovnbe %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test17:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovnbe %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test17:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovnbe %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test17:
 ; NOCMOV:       # %bb.0:
@@ -1118,27 +1011,16 @@ define x86_fp80 @test18(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-NEXT:    fstp %st(1)
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test18:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovnb %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test18:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovnb %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test18:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovnb %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test18:
 ; NOCMOV:       # %bb.0:
@@ -1171,27 +1053,16 @@ define x86_fp80 @test19(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-NEXT:    fstp %st(1)
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test19:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovb %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test19:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovb %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test19:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovb %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test19:
 ; NOCMOV:       # %bb.0:
@@ -1224,27 +1095,16 @@ define x86_fp80 @test20(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-NEXT:    fstp %st(1)
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test20:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovbe %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test20:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovbe %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test20:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovbe %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test20:
 ; NOCMOV:       # %bb.0:
@@ -1279,31 +1139,18 @@ define x86_fp80 @test21(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-NEXT:    fstp %st(1)
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test21:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    setg %al
-; NOSSE2-NEXT:    testb %al, %al
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovne %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test21:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    setg %al
-; NOSSE1-NEXT:    testb %al, %al
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovne %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test21:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    setg %al
+; NOSSE-NEXT:    testb %al, %al
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovne %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test21:
 ; NOCMOV:       # %bb.0:
@@ -1339,31 +1186,18 @@ define x86_fp80 @test22(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-NEXT:    fstp %st(1)
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test22:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    setge %al
-; NOSSE2-NEXT:    testb %al, %al
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovne %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test22:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    setge %al
-; NOSSE1-NEXT:    testb %al, %al
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovne %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test22:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    setge %al
+; NOSSE-NEXT:    testb %al, %al
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovne %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test22:
 ; NOCMOV:       # %bb.0:
@@ -1398,31 +1232,18 @@ define x86_fp80 @test23(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-NEXT:    fstp %st(1)
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test23:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    setl %al
-; NOSSE2-NEXT:    testb %al, %al
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovne %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test23:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    setl %al
-; NOSSE1-NEXT:    testb %al, %al
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovne %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test23:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    setl %al
+; NOSSE-NEXT:    testb %al, %al
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovne %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test23:
 ; NOCMOV:       # %bb.0:
@@ -1457,31 +1278,18 @@ define x86_fp80 @test24(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-NEXT:    fstp %st(1)
 ; SSE-NEXT:    retl
 ;
-; NOSSE2-LABEL: test24:
-; NOSSE2:       # %bb.0:
-; NOSSE2-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE2-NEXT:    setle %al
-; NOSSE2-NEXT:    testb %al, %al
-; NOSSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE2-NEXT:    fxch %st(1)
-; NOSSE2-NEXT:    fcmovne %st(1), %st
-; NOSSE2-NEXT:    fstp %st(1)
-; NOSSE2-NEXT:    retl
-;
-; NOSSE1-LABEL: test24:
-; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    fldt {{[0-9]+}}(%esp)
-; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    setle %al
-; NOSSE1-NEXT:    testb %al, %al
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
-; NOSSE1-NEXT:    fcmovne %st(1), %st
-; NOSSE1-NEXT:    fstp %st(1)
-; NOSSE1-NEXT:    retl
+; NOSSE-LABEL: test24:
+; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NOSSE-NEXT:    setle %al
+; NOSSE-NEXT:    testb %al, %al
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fcmovne %st(1), %st
+; NOSSE-NEXT:    fstp %st(1)
+; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test24:
 ; NOCMOV:       # %bb.0:

From 139bcda542514b7a064fe9225014ec4268bb2b65 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 29 Feb 2024 12:33:49 +0000
Subject: [PATCH 07/55] [X86] SimplifyDemandedVectorEltsForTargetNode - add
 basic CVTPH2PS/CVTPS2PH handling

Allows us to peek through the F16 conversion nodes, mainly to simplify shuffles

An easy part of #83414
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   4 +-
 .../test/CodeGen/X86/avx512-insert-extract.ll |   8 +-
 llvm/test/CodeGen/X86/avx512-vec-cmp.ll       |  21 +--
 llvm/test/CodeGen/X86/cvt16.ll                |   1 -
 .../CodeGen/X86/f16c-intrinsics-fast-isel.ll  |   7 +-
 .../X86/fold-int-pow2-with-fmul-or-fdiv.ll    |   4 -
 llvm/test/CodeGen/X86/fp-roundeven.ll         |   1 -
 llvm/test/CodeGen/X86/fpclamptosat_vec.ll     |  86 ++++++------
 llvm/test/CodeGen/X86/half.ll                 |  62 ++++-----
 llvm/test/CodeGen/X86/pr31088.ll              |  12 +-
 llvm/test/CodeGen/X86/pr57340.ll              | 125 +++++++++---------
 llvm/test/CodeGen/X86/prefer-fpext-splat.ll   |   2 -
 .../CodeGen/X86/vector-half-conversions.ll    |  30 ++---
 .../CodeGen/X86/vector-reduce-fmax-nnan.ll    |  12 +-
 .../CodeGen/X86/vector-reduce-fmin-nnan.ll    |  12 +-
 15 files changed, 160 insertions(+), 227 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 93088c7cde938b..d98d914894a3f8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41402,7 +41402,9 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     break;
   }
   case X86ISD::CVTSI2P:
-  case X86ISD::CVTUI2P: {
+  case X86ISD::CVTUI2P:
+  case X86ISD::CVTPH2PS:
+  case X86ISD::CVTPS2PH: {
     SDValue Src = Op.getOperand(0);
     MVT SrcVT = Src.getSimpleValueType();
     APInt SrcUndef, SrcZero;
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 22aae4de4db9d2..3e40bfa1e791d0 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -2171,14 +2171,13 @@ define void @test_concat_v2i1(ptr %arg, ptr %arg1, ptr %arg2) nounwind {
 ; KNL-LABEL: test_concat_v2i1:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; KNL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; KNL-NEXT:    vcvtph2ps %xmm1, %xmm1
+; KNL-NEXT:    vcvtph2ps %xmm0, %xmm1
 ; KNL-NEXT:    vmovss {{.*#+}} xmm2 = [6.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; KNL-NEXT:    vucomiss %xmm2, %xmm1
 ; KNL-NEXT:    setb %al
 ; KNL-NEXT:    andl $1, %eax
 ; KNL-NEXT:    kmovw %eax, %k0
-; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; KNL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
 ; KNL-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; KNL-NEXT:    vucomiss %xmm2, %xmm0
 ; KNL-NEXT:    setb %al
@@ -2207,14 +2206,13 @@ define void @test_concat_v2i1(ptr %arg, ptr %arg1, ptr %arg2) nounwind {
 ; SKX-LABEL: test_concat_v2i1:
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; SKX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[1,1,1,1,4,5,6,7]
 ; SKX-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; SKX-NEXT:    vmovss {{.*#+}} xmm2 = [6.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SKX-NEXT:    vucomiss %xmm2, %xmm1
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    kmovd %eax, %k0
 ; SKX-NEXT:    kshiftlb $1, %k0, %k0
-; SKX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; SKX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; SKX-NEXT:    vucomiss %xmm2, %xmm0
 ; SKX-NEXT:    setb %al
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index f3c728a990f514..86ebb1e40870f8 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1436,9 +1436,8 @@ define void @half_vec_compare(ptr %x, ptr %y) {
 ; KNL:       ## %bb.0: ## %entry
 ; KNL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; KNL-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07]
-; KNL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; KNL-NEXT:    ## encoding: [0xc4,0xe2,0x79,0x00,0x0d,A,A,A,A]
-; KNL-NEXT:    ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; KNL-NEXT:    vpshuflw $85, %xmm0, %xmm1 ## encoding: [0xc5,0xfb,0x70,0xc8,0x55]
+; KNL-NEXT:    ## xmm1 = xmm0[1,1,1,1,4,5,6,7]
 ; KNL-NEXT:    vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9]
 ; KNL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
 ; KNL-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
@@ -1448,8 +1447,6 @@ define void @half_vec_compare(ptr %x, ptr %y) {
 ; KNL-NEXT:    movl $0, %edx ## encoding: [0xba,0x00,0x00,0x00,0x00]
 ; KNL-NEXT:    cmovnel %ecx, %edx ## encoding: [0x0f,0x45,0xd1]
 ; KNL-NEXT:    cmovpl %ecx, %edx ## encoding: [0x0f,0x4a,0xd1]
-; KNL-NEXT:    vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0]
-; KNL-NEXT:    ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; KNL-NEXT:    vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0]
 ; KNL-NEXT:    vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2]
 ; KNL-NEXT:    cmovnel %ecx, %eax ## encoding: [0x0f,0x45,0xc1]
@@ -1466,9 +1463,8 @@ define void @half_vec_compare(ptr %x, ptr %y) {
 ; AVX512BW:       ## %bb.0: ## %entry
 ; AVX512BW-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512BW-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    ## encoding: [0xc4,0xe2,0x79,0x00,0x0d,A,A,A,A]
-; AVX512BW-NEXT:    ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512BW-NEXT:    vpshuflw $85, %xmm0, %xmm1 ## encoding: [0xc5,0xfb,0x70,0xc8,0x55]
+; AVX512BW-NEXT:    ## xmm1 = xmm0[1,1,1,1,4,5,6,7]
 ; AVX512BW-NEXT:    vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9]
 ; AVX512BW-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
 ; AVX512BW-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
@@ -1478,8 +1474,6 @@ define void @half_vec_compare(ptr %x, ptr %y) {
 ; AVX512BW-NEXT:    movl $0, %edx ## encoding: [0xba,0x00,0x00,0x00,0x00]
 ; AVX512BW-NEXT:    cmovnel %ecx, %edx ## encoding: [0x0f,0x45,0xd1]
 ; AVX512BW-NEXT:    cmovpl %ecx, %edx ## encoding: [0x0f,0x4a,0xd1]
-; AVX512BW-NEXT:    vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0]
-; AVX512BW-NEXT:    ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX512BW-NEXT:    vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0]
 ; AVX512BW-NEXT:    vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2]
 ; AVX512BW-NEXT:    cmovnel %ecx, %eax ## encoding: [0x0f,0x45,0xc1]
@@ -1496,9 +1490,8 @@ define void @half_vec_compare(ptr %x, ptr %y) {
 ; SKX:       ## %bb.0: ## %entry
 ; SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SKX-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; SKX-NEXT:    ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0x0d,A,A,A,A]
-; SKX-NEXT:    ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; SKX-NEXT:    vpshuflw $85, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x70,0xc8,0x55]
+; SKX-NEXT:    ## xmm1 = xmm0[1,1,1,1,4,5,6,7]
 ; SKX-NEXT:    vcvtph2ps %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc9]
 ; SKX-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2]
 ; SKX-NEXT:    vucomiss %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca]
@@ -1507,8 +1500,6 @@ define void @half_vec_compare(ptr %x, ptr %y) {
 ; SKX-NEXT:    orb %al, %cl ## encoding: [0x08,0xc1]
 ; SKX-NEXT:    testb %cl, %cl ## encoding: [0x84,0xc9]
 ; SKX-NEXT:    setne %al ## encoding: [0x0f,0x95,0xc0]
-; SKX-NEXT:    vpmovzxwq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x34,0xc0]
-; SKX-NEXT:    ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; SKX-NEXT:    vcvtph2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0]
 ; SKX-NEXT:    vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2]
 ; SKX-NEXT:    setp %cl ## encoding: [0x0f,0x9a,0xc1]
diff --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll
index 59097f8fb5d247..c7ef353f7f6038 100644
--- a/llvm/test/CodeGen/X86/cvt16.ll
+++ b/llvm/test/CodeGen/X86/cvt16.ll
@@ -89,7 +89,6 @@ define float @test3(float %src) nounwind uwtable readnone {
 ; F16C-LABEL: test3:
 ; F16C:       # %bb.0:
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
index e114c205d7972b..1886e2911ede80 100644
--- a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
@@ -18,8 +18,7 @@ define float @test_cvtsh_ss(i16 %a0) nounwind {
 ;
 ; X64-LABEL: test_cvtsh_ss:
 ; X64:       # %bb.0:
-; X64-NEXT:    movzwl %di, %eax
-; X64-NEXT:    vmovd %eax, %xmm0
+; X64-NEXT:    vmovd %edi, %xmm0
 ; X64-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; X64-NEXT:    retq
   %ins0 = insertelement <8 x i16> undef, i16 %a0, i32 0
@@ -41,8 +40,6 @@ define i16 @test_cvtss_sh(float %a0) nounwind {
 ; X86-LABEL: test_cvtss_sh:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86-NEXT:    vcvtps2ph $0, %xmm0, %xmm0
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -50,8 +47,6 @@ define i16 @test_cvtss_sh(float %a0) nounwind {
 ;
 ; X64-LABEL: test_cvtss_sh:
 ; X64:       # %bb.0:
-; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-NEXT:    vcvtps2ph $0, %xmm0, %xmm0
 ; X64-NEXT:    vmovd %xmm0, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 5f326b6d6998fb..8f875c70a25f6d 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1432,7 +1432,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; CHECK-NO-FASTFMA-NEXT:    shll %cl, %eax
 ; CHECK-NO-FASTFMA-NEXT:    vcvtusi2ss %eax, %xmm0, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; CHECK-NO-FASTFMA-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NO-FASTFMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
@@ -1447,7 +1446,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    shlxl %edi, %eax, %eax
 ; CHECK-FMA-NEXT:    vcvtusi2ss %eax, %xmm0, %xmm0
 ; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-FMA-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; CHECK-FMA-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; CHECK-FMA-NEXT:    vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
@@ -1550,7 +1548,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; CHECK-NO-FASTFMA-NEXT:    movzwl %ax, %eax
 ; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; CHECK-NO-FASTFMA-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NO-FASTFMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
@@ -1566,7 +1563,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    movzwl %ax, %eax
 ; CHECK-FMA-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
 ; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-FMA-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; CHECK-FMA-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; CHECK-FMA-NEXT:    vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/fp-roundeven.ll b/llvm/test/CodeGen/X86/fp-roundeven.ll
index fed2060dabd3af..8037c783dd8e67 100644
--- a/llvm/test/CodeGen/X86/fp-roundeven.ll
+++ b/llvm/test/CodeGen/X86/fp-roundeven.ll
@@ -51,7 +51,6 @@ define half @roundeven_f16(half %h) {
 ; AVX512F-LABEL: roundeven_f16:
 ; AVX512F:       ## %bb.0: ## %entry
 ; AVX512F-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX512F-NEXT:    movzwl %ax, %eax
 ; AVX512F-NEXT:    vmovd %eax, %xmm0
 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundss $8, %xmm0, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index a3fb71f817ce47..6aad4c2ebba1d8 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -698,24 +698,23 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) nounwind {
 ;
 ; AVX2-LABEL: stest_f16i32:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm1
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX2-NEXT:    vcvttss2si %xmm1, %rax
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX2-NEXT:    vcvttss2si %xmm1, %rcx
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm1
 ; AVX2-NEXT:    vmovq %rax, %xmm2
 ; AVX2-NEXT:    vcvttss2si %xmm1, %rax
 ; AVX2-NEXT:    vmovq %rcx, %xmm1
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %rax, %xmm2
 ; AVX2-NEXT:    vcvttss2si %xmm0, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
 ; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
@@ -837,7 +836,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind {
 ;
 ; AVX2-LABEL: utesth_f16i32:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm1
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm2
 ; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX2-NEXT:    vsubss %xmm1, %xmm2, %xmm3
@@ -846,29 +845,28 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind {
 ; AVX2-NEXT:    movq %rcx, %rdx
 ; AVX2-NEXT:    sarq $63, %rdx
 ; AVX2-NEXT:    andq %rax, %rdx
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; AVX2-NEXT:    vsubss %xmm1, %xmm2, %xmm3
 ; AVX2-NEXT:    vcvttss2si %xmm3, %rax
-; AVX2-NEXT:    vmovq %rdx, %xmm3
 ; AVX2-NEXT:    vcvttss2si %xmm2, %rcx
+; AVX2-NEXT:    vmovq %rdx, %xmm2
 ; AVX2-NEXT:    movq %rcx, %rdx
 ; AVX2-NEXT:    sarq $63, %rdx
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm3
 ; AVX2-NEXT:    andq %rax, %rdx
-; AVX2-NEXT:    vsubss %xmm1, %xmm2, %xmm4
+; AVX2-NEXT:    vsubss %xmm1, %xmm3, %xmm4
 ; AVX2-NEXT:    vcvttss2si %xmm4, %rax
 ; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm4
-; AVX2-NEXT:    vcvttss2si %xmm2, %rcx
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm3[0]
+; AVX2-NEXT:    vcvttss2si %xmm3, %rcx
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
 ; AVX2-NEXT:    movq %rcx, %rdx
 ; AVX2-NEXT:    sarq $63, %rdx
 ; AVX2-NEXT:    andq %rax, %rdx
 ; AVX2-NEXT:    orq %rcx, %rdx
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT:    vcvttss2si %xmm1, %rax
@@ -879,7 +877,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind {
 ; AVX2-NEXT:    andq %rax, %rdx
 ; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm1
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
@@ -1001,24 +999,23 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) nounwind {
 ;
 ; AVX2-LABEL: ustest_f16i32:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm1
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX2-NEXT:    vcvttss2si %xmm1, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm1
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; AVX2-NEXT:    vcvttss2si %xmm2, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm2
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm2
 ; AVX2-NEXT:    vcvttss2si %xmm2, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm2
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX2-NEXT:    vcvttss2si %xmm0, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
 ; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
@@ -3313,24 +3310,23 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) nounwind {
 ;
 ; AVX2-LABEL: stest_f16i32_mm:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm1
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX2-NEXT:    vcvttss2si %xmm1, %rax
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX2-NEXT:    vcvttss2si %xmm1, %rcx
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm1
 ; AVX2-NEXT:    vmovq %rax, %xmm2
 ; AVX2-NEXT:    vcvttss2si %xmm1, %rax
 ; AVX2-NEXT:    vmovq %rcx, %xmm1
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %rax, %xmm2
 ; AVX2-NEXT:    vcvttss2si %xmm0, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
 ; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
@@ -3450,7 +3446,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind {
 ;
 ; AVX2-LABEL: utesth_f16i32_mm:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm1
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm2
 ; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX2-NEXT:    vsubss %xmm1, %xmm2, %xmm3
@@ -3459,29 +3455,28 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind {
 ; AVX2-NEXT:    movq %rcx, %rdx
 ; AVX2-NEXT:    sarq $63, %rdx
 ; AVX2-NEXT:    andq %rax, %rdx
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; AVX2-NEXT:    vsubss %xmm1, %xmm2, %xmm3
 ; AVX2-NEXT:    vcvttss2si %xmm3, %rax
-; AVX2-NEXT:    vmovq %rdx, %xmm3
 ; AVX2-NEXT:    vcvttss2si %xmm2, %rcx
+; AVX2-NEXT:    vmovq %rdx, %xmm2
 ; AVX2-NEXT:    movq %rcx, %rdx
 ; AVX2-NEXT:    sarq $63, %rdx
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm3
 ; AVX2-NEXT:    andq %rax, %rdx
-; AVX2-NEXT:    vsubss %xmm1, %xmm2, %xmm4
+; AVX2-NEXT:    vsubss %xmm1, %xmm3, %xmm4
 ; AVX2-NEXT:    vcvttss2si %xmm4, %rax
 ; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm4
-; AVX2-NEXT:    vcvttss2si %xmm2, %rcx
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm3[0]
+; AVX2-NEXT:    vcvttss2si %xmm3, %rcx
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
 ; AVX2-NEXT:    movq %rcx, %rdx
 ; AVX2-NEXT:    sarq $63, %rdx
 ; AVX2-NEXT:    andq %rax, %rdx
 ; AVX2-NEXT:    orq %rcx, %rdx
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT:    vcvttss2si %xmm1, %rax
@@ -3492,7 +3487,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind {
 ; AVX2-NEXT:    andq %rax, %rdx
 ; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm1
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm1
@@ -3613,24 +3608,23 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) nounwind {
 ;
 ; AVX2-LABEL: ustest_f16i32_mm:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm1
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX2-NEXT:    vcvttss2si %xmm1, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm1
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; AVX2-NEXT:    vcvttss2si %xmm2, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm2
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm2
 ; AVX2-NEXT:    vcvttss2si %xmm2, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm2
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX2-NEXT:    vcvttss2si %xmm0, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
 ; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 2e1322446032ff..9f01d07e6a6705 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -851,16 +851,14 @@ define float @test_sitofp_fadd_i32(i32 %a, ptr %b) #0 {
 ;
 ; BWON-F16C-LABEL: test_sitofp_fadd_i32:
 ; BWON-F16C:       # %bb.0:
-; BWON-F16C-NEXT:    movzwl (%rsi), %eax
 ; BWON-F16C-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT:    movzwl (%rsi), %eax
 ; BWON-F16C-NEXT:    vmovd %eax, %xmm1
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; BWON-F16C-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; BWON-F16C-NEXT:    retq
 ;
@@ -919,7 +917,6 @@ define half @PR40273(half) #0 {
 ; BWON-F16C-LABEL: PR40273:
 ; BWON-F16C:       # %bb.0:
 ; BWON-F16C-NEXT:    vpextrw $0, %xmm0, %eax
-; BWON-F16C-NEXT:    movzwl %ax, %eax
 ; BWON-F16C-NEXT:    vmovd %eax, %xmm0
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; BWON-F16C-NEXT:    xorl %eax, %eax
@@ -973,7 +970,6 @@ define void @brcond(half %0) #0 {
 ; BWON-F16C-LABEL: brcond:
 ; BWON-F16C:       # %bb.0: # %entry
 ; BWON-F16C-NEXT:    vpextrw $0, %xmm0, %eax
-; BWON-F16C-NEXT:    movzwl %ax, %eax
 ; BWON-F16C-NEXT:    vmovd %eax, %xmm0
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; BWON-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
@@ -1029,7 +1025,6 @@ define half @test_sqrt(half %0) #0 {
 ; BWON-F16C-LABEL: test_sqrt:
 ; BWON-F16C:       # %bb.0: # %entry
 ; BWON-F16C-NEXT:    vpextrw $0, %xmm0, %eax
-; BWON-F16C-NEXT:    movzwl %ax, %eax
 ; BWON-F16C-NEXT:    vmovd %eax, %xmm0
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; BWON-F16C-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
@@ -1083,7 +1078,6 @@ define void @main.158() #0 {
 ; BWON-F16C:       # %bb.0: # %entry
 ; BWON-F16C-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
-; BWON-F16C-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; BWON-F16C-NEXT:    vmovss {{.*#+}} xmm2 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; BWON-F16C-NEXT:    vucomiss %xmm1, %xmm2
@@ -1172,8 +1166,7 @@ define void @main.45() #0 {
 ;
 ; BWON-F16C-LABEL: main.45:
 ; BWON-F16C:       # %bb.0: # %entry
-; BWON-F16C-NEXT:    movzwl (%rax), %eax
-; BWON-F16C-NEXT:    vmovd %eax, %xmm0
+; BWON-F16C-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; BWON-F16C-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; BWON-F16C-NEXT:    xorl %eax, %eax
@@ -1345,10 +1338,8 @@ define half @pr61271(half %0, half %1) #0 {
 ; BWON-F16C:       # %bb.0:
 ; BWON-F16C-NEXT:    vpextrw $0, %xmm0, %eax
 ; BWON-F16C-NEXT:    vpextrw $0, %xmm1, %ecx
-; BWON-F16C-NEXT:    movzwl %cx, %ecx
 ; BWON-F16C-NEXT:    vmovd %ecx, %xmm0
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
-; BWON-F16C-NEXT:    movzwl %ax, %eax
 ; BWON-F16C-NEXT:    vmovd %eax, %xmm1
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; BWON-F16C-NEXT:    vminss %xmm0, %xmm1, %xmm0
@@ -1614,10 +1605,9 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
 ;
 ; BWON-F16C-LABEL: maxnum_v8f16:
 ; BWON-F16C:       # %bb.0:
-; BWON-F16C-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,11,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; BWON-F16C-NEXT:    vpshufb %xmm3, %xmm1, %xmm2
+; BWON-F16C-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm2, %xmm2
-; BWON-F16C-NEXT:    vpshufb %xmm3, %xmm0, %xmm3
+; BWON-F16C-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm3, %xmm3
 ; BWON-F16C-NEXT:    vucomiss %xmm2, %xmm3
 ; BWON-F16C-NEXT:    ja .LBB26_2
@@ -1625,10 +1615,9 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
 ; BWON-F16C-NEXT:    vmovaps %xmm2, %xmm3
 ; BWON-F16C-NEXT:  .LBB26_2:
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm3, %xmm2
-; BWON-F16C-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,9,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; BWON-F16C-NEXT:    vpshufb %xmm4, %xmm1, %xmm3
+; BWON-F16C-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm3, %xmm3
-; BWON-F16C-NEXT:    vpshufb %xmm4, %xmm0, %xmm4
+; BWON-F16C-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm4, %xmm4
 ; BWON-F16C-NEXT:    vucomiss %xmm3, %xmm4
 ; BWON-F16C-NEXT:    ja .LBB26_4
@@ -1638,10 +1627,9 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
 ; BWON-F16C-NEXT:    vmovd %xmm2, %eax
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm4, %xmm2
 ; BWON-F16C-NEXT:    vmovd %xmm2, %ecx
-; BWON-F16C-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; BWON-F16C-NEXT:    vpshufb %xmm2, %xmm1, %xmm3
-; BWON-F16C-NEXT:    vcvtph2ps %xmm3, %xmm3
-; BWON-F16C-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
+; BWON-F16C-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; BWON-F16C-NEXT:    vcvtph2ps %xmm2, %xmm3
+; BWON-F16C-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; BWON-F16C-NEXT:    vucomiss %xmm3, %xmm2
 ; BWON-F16C-NEXT:    ja .LBB26_6
@@ -1650,9 +1638,9 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
 ; BWON-F16C-NEXT:  .LBB26_6:
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
 ; BWON-F16C-NEXT:    vmovd %xmm2, %edx
-; BWON-F16C-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; BWON-F16C-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm2, %xmm3
-; BWON-F16C-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; BWON-F16C-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; BWON-F16C-NEXT:    vucomiss %xmm3, %xmm2
 ; BWON-F16C-NEXT:    ja .LBB26_8
@@ -1661,10 +1649,9 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
 ; BWON-F16C-NEXT:  .LBB26_8:
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
 ; BWON-F16C-NEXT:    vmovd %xmm2, %esi
-; BWON-F16C-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; BWON-F16C-NEXT:    vpshufb %xmm3, %xmm1, %xmm2
+; BWON-F16C-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7]
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm2, %xmm2
-; BWON-F16C-NEXT:    vpshufb %xmm3, %xmm0, %xmm3
+; BWON-F16C-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7]
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm3, %xmm6
 ; BWON-F16C-NEXT:    vucomiss %xmm2, %xmm6
 ; BWON-F16C-NEXT:    ja .LBB26_10
@@ -1677,9 +1664,9 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
 ; BWON-F16C-NEXT:    vpinsrw $0, %esi, %xmm0, %xmm5
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
 ; BWON-F16C-NEXT:    vmovd %xmm6, %eax
-; BWON-F16C-NEXT:    vpsrlq $48, %xmm1, %xmm6
+; BWON-F16C-NEXT:    vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3]
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm6, %xmm7
-; BWON-F16C-NEXT:    vpsrlq $48, %xmm0, %xmm6
+; BWON-F16C-NEXT:    vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm6, %xmm6
 ; BWON-F16C-NEXT:    vucomiss %xmm7, %xmm6
 ; BWON-F16C-NEXT:    ja .LBB26_12
@@ -1687,29 +1674,26 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
 ; BWON-F16C-NEXT:    vmovaps %xmm7, %xmm6
 ; BWON-F16C-NEXT:  .LBB26_12:
 ; BWON-F16C-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; BWON-F16C-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; BWON-F16C-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; BWON-F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm6, %xmm5
 ; BWON-F16C-NEXT:    vmovd %xmm5, %eax
 ; BWON-F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; BWON-F16C-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; BWON-F16C-NEXT:    vpshufb %xmm6, %xmm1, %xmm7
-; BWON-F16C-NEXT:    vcvtph2ps %xmm7, %xmm7
-; BWON-F16C-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
-; BWON-F16C-NEXT:    vcvtph2ps %xmm6, %xmm6
+; BWON-F16C-NEXT:    vcvtph2ps %xmm1, %xmm7
+; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm6
 ; BWON-F16C-NEXT:    vucomiss %xmm7, %xmm6
 ; BWON-F16C-NEXT:    ja .LBB26_14
 ; BWON-F16C-NEXT:  # %bb.13:
 ; BWON-F16C-NEXT:    vmovaps %xmm7, %xmm6
 ; BWON-F16C-NEXT:  .LBB26_14:
-; BWON-F16C-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; BWON-F16C-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; BWON-F16C-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; BWON-F16C-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm6, %xmm4
 ; BWON-F16C-NEXT:    vmovd %xmm4, %eax
 ; BWON-F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; BWON-F16C-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; BWON-F16C-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
-; BWON-F16C-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; BWON-F16C-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; BWON-F16C-NEXT:    vucomiss %xmm1, %xmm0
 ; BWON-F16C-NEXT:    ja .LBB26_16
@@ -1719,7 +1703,7 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 {
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; BWON-F16C-NEXT:    vmovd %xmm0, %eax
 ; BWON-F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; BWON-F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
 ; BWON-F16C-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; BWON-F16C-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; BWON-F16C-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll
index a21653bc7330c9..ce37622c476db4 100644
--- a/llvm/test/CodeGen/X86/pr31088.ll
+++ b/llvm/test/CodeGen/X86/pr31088.ll
@@ -41,9 +41,7 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
 ;
 ; F16C-LABEL: ir_fadd_v1f16:
 ; F16C:       # %bb.0:
-; F16C-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
-; F16C-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -54,13 +52,15 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
 ; F16C-O0-LABEL: ir_fadd_v1f16:
 ; F16C-O0:       # %bb.0:
 ; F16C-O0-NEXT:    vpextrw $0, %xmm1, %eax
-; F16C-O0-NEXT:    # kill: def $ax killed $ax killed $eax
-; F16C-O0-NEXT:    movzwl %ax, %eax
+; F16C-O0-NEXT:    movw %ax, %cx
+; F16C-O0-NEXT:    # implicit-def: $eax
+; F16C-O0-NEXT:    movw %cx, %ax
 ; F16C-O0-NEXT:    vmovd %eax, %xmm1
 ; F16C-O0-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; F16C-O0-NEXT:    vpextrw $0, %xmm0, %eax
-; F16C-O0-NEXT:    # kill: def $ax killed $ax killed $eax
-; F16C-O0-NEXT:    movzwl %ax, %eax
+; F16C-O0-NEXT:    movw %ax, %cx
+; F16C-O0-NEXT:    # implicit-def: $eax
+; F16C-O0-NEXT:    movw %cx, %ax
 ; F16C-O0-NEXT:    vmovd %eax, %xmm0
 ; F16C-O0-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-O0-NEXT:    vaddss %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr57340.ll b/llvm/test/CodeGen/X86/pr57340.ll
index 95f839c338e701..00a52c639e43c6 100644
--- a/llvm/test/CodeGen/X86/pr57340.ll
+++ b/llvm/test/CodeGen/X86/pr57340.ll
@@ -5,29 +5,28 @@ define void @main.41() local_unnamed_addr #1 {
 ; CHECK-LABEL: main.41:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpbroadcastw (%rax), %xmm0
-; CHECK-NEXT:    vmovdqu (%rax), %ymm1
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    vmovdqu (%rax), %xmm10
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; CHECK-NEXT:    vpshufb %xmm1, %xmm10, %xmm2
-; CHECK-NEXT:    vcvtph2ps %xmm2, %xmm2
-; CHECK-NEXT:    vpshufb %xmm1, %xmm3, %xmm4
-; CHECK-NEXT:    vcvtph2ps %xmm4, %xmm4
-; CHECK-NEXT:    vucomiss %xmm4, %xmm2
-; CHECK-NEXT:    setnp %al
-; CHECK-NEXT:    sete %cl
-; CHECK-NEXT:    testb %al, %cl
-; CHECK-NEXT:    setne %al
-; CHECK-NEXT:    kmovd %eax, %k0
+; CHECK-NEXT:    vpextrw $0, %xmm0, %eax
+; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
+; CHECK-NEXT:    vmovdqu (%rax), %ymm3
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; CHECK-NEXT:    vpermi2w %ymm1, %ymm3, %ymm2
+; CHECK-NEXT:    vprold $16, %xmm2, %xmm1
+; CHECK-NEXT:    vcvtph2ps %xmm1, %xmm3
+; CHECK-NEXT:    vmovdqu (%rax), %xmm5
+; CHECK-NEXT:    vprold $16, %xmm5, %xmm1
+; CHECK-NEXT:    vcvtph2ps %xmm1, %xmm1
+; CHECK-NEXT:    vucomiss %xmm3, %xmm1
+; CHECK-NEXT:    setnp %cl
+; CHECK-NEXT:    sete %dl
+; CHECK-NEXT:    testb %cl, %dl
+; CHECK-NEXT:    setne %cl
+; CHECK-NEXT:    kmovd %ecx, %k0
 ; CHECK-NEXT:    kshiftlw $15, %k0, %k0
+; CHECK-NEXT:    vmovd %eax, %xmm3
+; CHECK-NEXT:    vcvtph2ps %xmm3, %xmm3
+; CHECK-NEXT:    vcvtph2ps %xmm5, %xmm6
 ; CHECK-NEXT:    kshiftrw $14, %k0, %k0
-; CHECK-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; CHECK-NEXT:    vcvtph2ps %xmm0, %xmm0
-; CHECK-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
-; CHECK-NEXT:    vcvtph2ps %xmm4, %xmm11
-; CHECK-NEXT:    vucomiss %xmm0, %xmm11
+; CHECK-NEXT:    vucomiss %xmm3, %xmm6
 ; CHECK-NEXT:    setnp %al
 ; CHECK-NEXT:    sete %cl
 ; CHECK-NEXT:    testb %al, %cl
@@ -38,10 +37,10 @@ define void @main.41() local_unnamed_addr #1 {
 ; CHECK-NEXT:    movw $-5, %ax
 ; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    kandw %k1, %k0, %k0
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,5,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; CHECK-NEXT:    vpshufb %xmm4, %xmm3, %xmm5
-; CHECK-NEXT:    vcvtph2ps %xmm5, %xmm5
-; CHECK-NEXT:    vucomiss %xmm5, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-NEXT:    vcvtph2ps %xmm3, %xmm3
+; CHECK-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-NEXT:    vucomiss %xmm3, %xmm0
 ; CHECK-NEXT:    setnp %al
 ; CHECK-NEXT:    sete %cl
 ; CHECK-NEXT:    testb %al, %cl
@@ -52,12 +51,12 @@ define void @main.41() local_unnamed_addr #1 {
 ; CHECK-NEXT:    korw %k1, %k0, %k0
 ; CHECK-NEXT:    movw $-9, %ax
 ; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpsrlq $48, %xmm3, %xmm5
-; CHECK-NEXT:    vcvtph2ps %xmm5, %xmm6
-; CHECK-NEXT:    vpsrlq $48, %xmm10, %xmm5
-; CHECK-NEXT:    vcvtph2ps %xmm5, %xmm5
 ; CHECK-NEXT:    kandw %k1, %k0, %k0
-; CHECK-NEXT:    vucomiss %xmm6, %xmm5
+; CHECK-NEXT:    vprolq $16, %xmm2, %xmm3
+; CHECK-NEXT:    vcvtph2ps %xmm3, %xmm4
+; CHECK-NEXT:    vprolq $16, %xmm5, %xmm3
+; CHECK-NEXT:    vcvtph2ps %xmm3, %xmm3
+; CHECK-NEXT:    vucomiss %xmm4, %xmm3
 ; CHECK-NEXT:    setnp %al
 ; CHECK-NEXT:    sete %cl
 ; CHECK-NEXT:    testb %al, %cl
@@ -68,11 +67,10 @@ define void @main.41() local_unnamed_addr #1 {
 ; CHECK-NEXT:    korw %k1, %k0, %k0
 ; CHECK-NEXT:    movw $-17, %ax
 ; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; CHECK-NEXT:    kandw %k1, %k0, %k0
-; CHECK-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
-; CHECK-NEXT:    vcvtph2ps %xmm7, %xmm7
-; CHECK-NEXT:    vucomiss %xmm7, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; CHECK-NEXT:    vcvtph2ps %xmm4, %xmm4
+; CHECK-NEXT:    vucomiss %xmm4, %xmm0
 ; CHECK-NEXT:    setnp %al
 ; CHECK-NEXT:    sete %cl
 ; CHECK-NEXT:    testb %al, %cl
@@ -83,13 +81,12 @@ define void @main.41() local_unnamed_addr #1 {
 ; CHECK-NEXT:    korw %k1, %k0, %k0
 ; CHECK-NEXT:    movw $-33, %ax
 ; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm7 = [10,11,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; CHECK-NEXT:    vpshufb %xmm7, %xmm10, %xmm8
-; CHECK-NEXT:    vcvtph2ps %xmm8, %xmm8
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vcvtph2ps %xmm4, %xmm7
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vcvtph2ps %xmm4, %xmm4
 ; CHECK-NEXT:    kandw %k1, %k0, %k0
-; CHECK-NEXT:    vpshufb %xmm7, %xmm3, %xmm9
-; CHECK-NEXT:    vcvtph2ps %xmm9, %xmm9
-; CHECK-NEXT:    vucomiss %xmm9, %xmm8
+; CHECK-NEXT:    vucomiss %xmm7, %xmm4
 ; CHECK-NEXT:    setnp %al
 ; CHECK-NEXT:    sete %cl
 ; CHECK-NEXT:    testb %al, %cl
@@ -100,11 +97,10 @@ define void @main.41() local_unnamed_addr #1 {
 ; CHECK-NEXT:    korw %k1, %k0, %k0
 ; CHECK-NEXT:    movw $-65, %ax
 ; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm9 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; CHECK-NEXT:    vpshufb %xmm9, %xmm3, %xmm12
-; CHECK-NEXT:    vcvtph2ps %xmm12, %xmm12
 ; CHECK-NEXT:    kandw %k1, %k0, %k0
-; CHECK-NEXT:    vucomiss %xmm12, %xmm0
+; CHECK-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3]
+; CHECK-NEXT:    vcvtph2ps %xmm7, %xmm7
+; CHECK-NEXT:    vucomiss %xmm7, %xmm0
 ; CHECK-NEXT:    setnp %al
 ; CHECK-NEXT:    sete %cl
 ; CHECK-NEXT:    testb %al, %cl
@@ -116,11 +112,11 @@ define void @main.41() local_unnamed_addr #1 {
 ; CHECK-NEXT:    movw $-129, %ax
 ; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    kandw %k1, %k0, %k0
-; CHECK-NEXT:    vpsrldq {{.*#+}} xmm12 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vcvtph2ps %xmm12, %xmm12
-; CHECK-NEXT:    vpsrldq {{.*#+}} xmm10 = xmm10[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vcvtph2ps %xmm10, %xmm10
-; CHECK-NEXT:    vucomiss %xmm12, %xmm10
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vcvtph2ps %xmm7, %xmm7
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vcvtph2ps %xmm5, %xmm5
+; CHECK-NEXT:    vucomiss %xmm7, %xmm5
 ; CHECK-NEXT:    setnp %al
 ; CHECK-NEXT:    sete %cl
 ; CHECK-NEXT:    testb %al, %cl
@@ -131,11 +127,10 @@ define void @main.41() local_unnamed_addr #1 {
 ; CHECK-NEXT:    korw %k1, %k0, %k0
 ; CHECK-NEXT:    movw $-257, %ax # imm = 0xFEFF
 ; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; CHECK-NEXT:    vpmovzxwq {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; CHECK-NEXT:    vcvtph2ps %xmm12, %xmm12
+; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; CHECK-NEXT:    vcvtph2ps %xmm2, %xmm7
 ; CHECK-NEXT:    kandw %k1, %k0, %k0
-; CHECK-NEXT:    vucomiss %xmm12, %xmm11
+; CHECK-NEXT:    vucomiss %xmm7, %xmm6
 ; CHECK-NEXT:    setnp %al
 ; CHECK-NEXT:    sete %cl
 ; CHECK-NEXT:    testb %al, %cl
@@ -147,9 +142,9 @@ define void @main.41() local_unnamed_addr #1 {
 ; CHECK-NEXT:    movw $-513, %ax # imm = 0xFDFF
 ; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    kandw %k1, %k0, %k0
-; CHECK-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; CHECK-NEXT:    vcvtph2ps %xmm1, %xmm1
-; CHECK-NEXT:    vucomiss %xmm1, %xmm2
+; CHECK-NEXT:    vprold $16, %xmm2, %xmm6
+; CHECK-NEXT:    vcvtph2ps %xmm6, %xmm6
+; CHECK-NEXT:    vucomiss %xmm6, %xmm1
 ; CHECK-NEXT:    setnp %al
 ; CHECK-NEXT:    sete %cl
 ; CHECK-NEXT:    testb %al, %cl
@@ -161,7 +156,7 @@ define void @main.41() local_unnamed_addr #1 {
 ; CHECK-NEXT:    movw $-1025, %ax # imm = 0xFBFF
 ; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    kandw %k1, %k0, %k0
-; CHECK-NEXT:    vpshufb %xmm4, %xmm3, %xmm1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; CHECK-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; CHECK-NEXT:    vucomiss %xmm1, %xmm0
 ; CHECK-NEXT:    setnp %al
@@ -175,9 +170,9 @@ define void @main.41() local_unnamed_addr #1 {
 ; CHECK-NEXT:    movw $-2049, %ax # imm = 0xF7FF
 ; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    kandw %k1, %k0, %k0
-; CHECK-NEXT:    vpsrlq $48, %xmm3, %xmm1
+; CHECK-NEXT:    vprolq $16, %xmm2, %xmm1
 ; CHECK-NEXT:    vcvtph2ps %xmm1, %xmm1
-; CHECK-NEXT:    vucomiss %xmm1, %xmm5
+; CHECK-NEXT:    vucomiss %xmm1, %xmm3
 ; CHECK-NEXT:    setnp %al
 ; CHECK-NEXT:    sete %cl
 ; CHECK-NEXT:    testb %al, %cl
@@ -189,7 +184,7 @@ define void @main.41() local_unnamed_addr #1 {
 ; CHECK-NEXT:    movw $-4097, %ax # imm = 0xEFFF
 ; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    kandw %k1, %k0, %k0
-; CHECK-NEXT:    vpshufb %xmm6, %xmm3, %xmm1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
 ; CHECK-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; CHECK-NEXT:    vucomiss %xmm1, %xmm0
 ; CHECK-NEXT:    setnp %al
@@ -203,9 +198,9 @@ define void @main.41() local_unnamed_addr #1 {
 ; CHECK-NEXT:    movw $-8193, %ax # imm = 0xDFFF
 ; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    kandw %k1, %k0, %k0
-; CHECK-NEXT:    vpshufb %xmm7, %xmm3, %xmm1
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtph2ps %xmm1, %xmm1
-; CHECK-NEXT:    vucomiss %xmm1, %xmm8
+; CHECK-NEXT:    vucomiss %xmm1, %xmm4
 ; CHECK-NEXT:    setnp %al
 ; CHECK-NEXT:    sete %cl
 ; CHECK-NEXT:    testb %al, %cl
@@ -216,7 +211,7 @@ define void @main.41() local_unnamed_addr #1 {
 ; CHECK-NEXT:    korw %k1, %k0, %k0
 ; CHECK-NEXT:    movw $-16385, %ax # imm = 0xBFFF
 ; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb %xmm9, %xmm3, %xmm1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3]
 ; CHECK-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; CHECK-NEXT:    kandw %k1, %k0, %k0
 ; CHECK-NEXT:    vucomiss %xmm1, %xmm0
@@ -228,10 +223,10 @@ define void @main.41() local_unnamed_addr #1 {
 ; CHECK-NEXT:    kshiftlw $14, %k1, %k1
 ; CHECK-NEXT:    korw %k1, %k0, %k0
 ; CHECK-NEXT:    kshiftlw $1, %k0, %k0
-; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; CHECK-NEXT:    kshiftrw $1, %k0, %k0
-; CHECK-NEXT:    vucomiss %xmm0, %xmm10
+; CHECK-NEXT:    vucomiss %xmm0, %xmm5
 ; CHECK-NEXT:    setnp %al
 ; CHECK-NEXT:    sete %cl
 ; CHECK-NEXT:    testb %al, %cl
diff --git a/llvm/test/CodeGen/X86/prefer-fpext-splat.ll b/llvm/test/CodeGen/X86/prefer-fpext-splat.ll
index 1d8b8b3f9a96ec..c3d7b2e15d0170 100644
--- a/llvm/test/CodeGen/X86/prefer-fpext-splat.ll
+++ b/llvm/test/CodeGen/X86/prefer-fpext-splat.ll
@@ -176,8 +176,6 @@ define <2 x double> @prefer_f16_v2f64(ptr %p) nounwind {
 ; AVX512F-LABEL: prefer_f16_v2f64:
 ; AVX512F:       # %bb.0: # %entry
 ; AVX512F-NEXT:    vpbroadcastw (%rdi), %xmm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512F-NEXT:    vcvtps2pd %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 3b82df5d5b74d2..ba21af231985a1 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -21,15 +21,13 @@ define float @cvt_i16_to_f32(i16 %a0) nounwind {
 ;
 ; F16C-LABEL: cvt_i16_to_f32:
 ; F16C:       # %bb.0:
-; F16C-NEXT:    movzwl %di, %eax
-; F16C-NEXT:    vmovd %eax, %xmm0
+; F16C-NEXT:    vmovd %edi, %xmm0
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    retq
 ;
 ; AVX512-LABEL: cvt_i16_to_f32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    movzwl %di, %eax
-; AVX512-NEXT:    vmovd %eax, %xmm0
+; AVX512-NEXT:    vmovd %edi, %xmm0
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %1 = bitcast i16 %a0 to half
@@ -1370,16 +1368,14 @@ define double @cvt_i16_to_f64(i16 %a0) nounwind {
 ;
 ; F16C-LABEL: cvt_i16_to_f64:
 ; F16C:       # %bb.0:
-; F16C-NEXT:    movzwl %di, %eax
-; F16C-NEXT:    vmovd %eax, %xmm0
+; F16C-NEXT:    vmovd %edi, %xmm0
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
 ; F16C-NEXT:    retq
 ;
 ; AVX512-LABEL: cvt_i16_to_f64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    movzwl %di, %eax
-; AVX512-NEXT:    vmovd %eax, %xmm0
+; AVX512-NEXT:    vmovd %edi, %xmm0
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
@@ -1410,14 +1406,12 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
 ;
 ; F16C-LABEL: cvt_2i16_to_2f64:
 ; F16C:       # %bb.0:
-; F16C-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2pd %xmm0, %xmm0
 ; F16C-NEXT:    retq
 ;
 ; AVX512-LABEL: cvt_2i16_to_2f64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    vcvtps2pd %xmm0, %xmm0
 ; AVX512-NEXT:    retq
@@ -1503,14 +1497,12 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
 ;
 ; F16C-LABEL: cvt_8i16_to_2f64:
 ; F16C:       # %bb.0:
-; F16C-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2pd %xmm0, %xmm0
 ; F16C-NEXT:    retq
 ;
 ; AVX512-LABEL: cvt_8i16_to_2f64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    vcvtps2pd %xmm0, %xmm0
 ; AVX512-NEXT:    retq
@@ -1877,16 +1869,14 @@ define <2 x double> @load_cvt_2i16_to_2f64(ptr %a0) nounwind {
 ;
 ; F16C-LABEL: load_cvt_2i16_to_2f64:
 ; F16C:       # %bb.0:
-; F16C-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; F16C-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; F16C-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2pd %xmm0, %xmm0
 ; F16C-NEXT:    retq
 ;
 ; AVX512-LABEL: load_cvt_2i16_to_2f64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    vcvtps2pd %xmm0, %xmm0
 ; AVX512-NEXT:    retq
@@ -4976,9 +4966,9 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
 ;
 ; F16C-LABEL: fptosi_2f16_to_4i32:
 ; F16C:       # %bb.0:
-; F16C-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; F16C-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
-; F16C-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; F16C-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; F16C-NEXT:    vcvttps2dq %xmm0, %xmm0
@@ -4987,9 +4977,9 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
 ;
 ; AVX512-LABEL: fptosi_2f16_to_4i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512-NEXT:    vcvttps2dq %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
index 6e8eefc607ee11..24113441a4e25a 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
@@ -413,10 +413,8 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512F-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
+; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm2
+; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm3
 ; AVX512F-NEXT:    xorl %eax, %eax
 ; AVX512F-NEXT:    vucomiss %xmm3, %xmm2
 ; AVX512F-NEXT:    movl $255, %ecx
@@ -430,10 +428,8 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
 ; AVX512VL-LABEL: test_v2f16:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
+; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm2
+; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm3
 ; AVX512VL-NEXT:    xorl %eax, %eax
 ; AVX512VL-NEXT:    vucomiss %xmm3, %xmm2
 ; AVX512VL-NEXT:    movl $255, %ecx
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
index 804ca183ad4c9d..edefb16d40e6ed 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
@@ -412,10 +412,8 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512F-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
+; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm2
+; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm3
 ; AVX512F-NEXT:    xorl %eax, %eax
 ; AVX512F-NEXT:    vucomiss %xmm3, %xmm2
 ; AVX512F-NEXT:    movl $255, %ecx
@@ -429,10 +427,8 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
 ; AVX512VL-LABEL: test_v2f16:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
+; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm2
+; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm3
 ; AVX512VL-NEXT:    xorl %eax, %eax
 ; AVX512VL-NEXT:    vucomiss %xmm3, %xmm2
 ; AVX512VL-NEXT:    movl $255, %ecx

From 4d525f2b9a42fee4bd3a3c45873fc38b35dd8004 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 29 Feb 2024 12:37:12 +0000
Subject: [PATCH 08/55] [VPlan] Remove unneeded InsertPointGuard (NFCI).

getBlockInMask now simply returns an already computed mask, hence
there's no need to adjust the builder insert point.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b81cd508663c32..50a073e890626e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9003,11 +9003,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
 
       BasicBlock *BB = CurrentLinkI->getParent();
       VPValue *CondOp = nullptr;
-      if (CM.blockNeedsPredicationForAnyReason(BB)) {
-        VPBuilder::InsertPointGuard Guard(Builder);
-        Builder.setInsertPoint(CurrentLink);
+      if (CM.blockNeedsPredicationForAnyReason(BB))
         CondOp = RecipeBuilder.getBlockInMask(BB);
-      }
 
       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
           RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp);

From 23f3651c7c5653781249ca0f87809dde76ceeac7 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Thu, 29 Feb 2024 06:51:01 -0600
Subject: [PATCH 09/55] [libc] Remove workaround for fmin / fmax after being
 fixed in LLVM (#83421)

Summary:
These hacks can be removed now that
https://github.com/llvm/llvm-project/pull/83376 fixed the underlying
problem.
---
 libc/src/math/amdgpu/fmax.cpp  | 4 ----
 libc/src/math/amdgpu/fmaxf.cpp | 6 ------
 libc/src/math/amdgpu/fmin.cpp  | 6 ------
 libc/src/math/amdgpu/fminf.cpp | 6 ------
 libc/src/math/nvptx/fmax.cpp   | 6 ------
 libc/src/math/nvptx/fmaxf.cpp  | 4 ----
 libc/src/math/nvptx/fmin.cpp   | 6 ------
 libc/src/math/nvptx/fminf.cpp  | 6 ------
 8 files changed, 44 deletions(-)

diff --git a/libc/src/math/amdgpu/fmax.cpp b/libc/src/math/amdgpu/fmax.cpp
index 09624cc6f092af..09f0f942a042a4 100644
--- a/libc/src/math/amdgpu/fmax.cpp
+++ b/libc/src/math/amdgpu/fmax.cpp
@@ -15,10 +15,6 @@
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(double, fmax, (double x, double y)) {
-  // FIXME: The builtin function does not correctly handle the +/-0.0 case.
-  if (LIBC_UNLIKELY(x == y))
-    return cpp::bit_cast<double>(cpp::bit_cast<uint64_t>(x) &
-                                 cpp::bit_cast<uint64_t>(y));
   return __builtin_fmax(x, y);
 }
 
diff --git a/libc/src/math/amdgpu/fmaxf.cpp b/libc/src/math/amdgpu/fmaxf.cpp
index f6ed46699a049f..5913a85df63703 100644
--- a/libc/src/math/amdgpu/fmaxf.cpp
+++ b/libc/src/math/amdgpu/fmaxf.cpp
@@ -8,17 +8,11 @@
 
 #include "src/math/fmaxf.h"
 
-#include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
-#include "src/__support/macros/optimization.h"
 
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(float, fmaxf, (float x, float y)) {
-  // FIXME: The builtin function does not correctly handle the +/-0.0 case.
-  if (LIBC_UNLIKELY(x == y))
-    return cpp::bit_cast<float>(cpp::bit_cast<uint32_t>(x) &
-                                cpp::bit_cast<uint32_t>(y));
   return __builtin_fmaxf(x, y);
 }
 
diff --git a/libc/src/math/amdgpu/fmin.cpp b/libc/src/math/amdgpu/fmin.cpp
index 8977ff7a066c6b..0d6f3521dcb705 100644
--- a/libc/src/math/amdgpu/fmin.cpp
+++ b/libc/src/math/amdgpu/fmin.cpp
@@ -8,17 +8,11 @@
 
 #include "src/math/fmin.h"
 
-#include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
-#include "src/__support/macros/optimization.h"
 
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(double, fmin, (double x, double y)) {
-  // FIXME: The builtin function does not correctly handle the +/-0.0 case.
-  if (LIBC_UNLIKELY(x == y))
-    return cpp::bit_cast<double>(cpp::bit_cast<uint64_t>(x) |
-                                 cpp::bit_cast<uint64_t>(y));
   return __builtin_fmin(x, y);
 }
 
diff --git a/libc/src/math/amdgpu/fminf.cpp b/libc/src/math/amdgpu/fminf.cpp
index 3be55257f61649..42744abfb3b02f 100644
--- a/libc/src/math/amdgpu/fminf.cpp
+++ b/libc/src/math/amdgpu/fminf.cpp
@@ -8,17 +8,11 @@
 
 #include "src/math/fminf.h"
 
-#include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
-#include "src/__support/macros/optimization.h"
 
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(float, fminf, (float x, float y)) {
-  // FIXME: The builtin function does not correctly handle the +/-0.0 case.
-  if (LIBC_UNLIKELY(x == y))
-    return cpp::bit_cast<float>(cpp::bit_cast<uint32_t>(x) |
-                                cpp::bit_cast<uint32_t>(y));
   return __builtin_fminf(x, y);
 }
 
diff --git a/libc/src/math/nvptx/fmax.cpp b/libc/src/math/nvptx/fmax.cpp
index 09624cc6f092af..3ba65d7eccd369 100644
--- a/libc/src/math/nvptx/fmax.cpp
+++ b/libc/src/math/nvptx/fmax.cpp
@@ -8,17 +8,11 @@
 
 #include "src/math/fmax.h"
 
-#include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
-#include "src/__support/macros/optimization.h"
 
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(double, fmax, (double x, double y)) {
-  // FIXME: The builtin function does not correctly handle the +/-0.0 case.
-  if (LIBC_UNLIKELY(x == y))
-    return cpp::bit_cast<double>(cpp::bit_cast<uint64_t>(x) &
-                                 cpp::bit_cast<uint64_t>(y));
   return __builtin_fmax(x, y);
 }
 
diff --git a/libc/src/math/nvptx/fmaxf.cpp b/libc/src/math/nvptx/fmaxf.cpp
index f6ed46699a049f..e977082b39f403 100644
--- a/libc/src/math/nvptx/fmaxf.cpp
+++ b/libc/src/math/nvptx/fmaxf.cpp
@@ -15,10 +15,6 @@
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(float, fmaxf, (float x, float y)) {
-  // FIXME: The builtin function does not correctly handle the +/-0.0 case.
-  if (LIBC_UNLIKELY(x == y))
-    return cpp::bit_cast<float>(cpp::bit_cast<uint32_t>(x) &
-                                cpp::bit_cast<uint32_t>(y));
   return __builtin_fmaxf(x, y);
 }
 
diff --git a/libc/src/math/nvptx/fmin.cpp b/libc/src/math/nvptx/fmin.cpp
index 8977ff7a066c6b..0d6f3521dcb705 100644
--- a/libc/src/math/nvptx/fmin.cpp
+++ b/libc/src/math/nvptx/fmin.cpp
@@ -8,17 +8,11 @@
 
 #include "src/math/fmin.h"
 
-#include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
-#include "src/__support/macros/optimization.h"
 
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(double, fmin, (double x, double y)) {
-  // FIXME: The builtin function does not correctly handle the +/-0.0 case.
-  if (LIBC_UNLIKELY(x == y))
-    return cpp::bit_cast<double>(cpp::bit_cast<uint64_t>(x) |
-                                 cpp::bit_cast<uint64_t>(y));
   return __builtin_fmin(x, y);
 }
 
diff --git a/libc/src/math/nvptx/fminf.cpp b/libc/src/math/nvptx/fminf.cpp
index 3be55257f61649..42744abfb3b02f 100644
--- a/libc/src/math/nvptx/fminf.cpp
+++ b/libc/src/math/nvptx/fminf.cpp
@@ -8,17 +8,11 @@
 
 #include "src/math/fminf.h"
 
-#include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
-#include "src/__support/macros/optimization.h"
 
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(float, fminf, (float x, float y)) {
-  // FIXME: The builtin function does not correctly handle the +/-0.0 case.
-  if (LIBC_UNLIKELY(x == y))
-    return cpp::bit_cast<float>(cpp::bit_cast<uint32_t>(x) |
-                                cpp::bit_cast<uint32_t>(y));
   return __builtin_fminf(x, y);
 }
 

From dbca8a49b6dbbb79913d6a1bc1d59f4947353e96 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 29 Feb 2024 12:53:13 +0000
Subject: [PATCH 10/55] [DAG] Improve known bits of Zext/Sext loads with range
 metadata (#80829)

This extends the known bits for extending loads which have range
metadata, handling the range metadata on the original memory type,
extending that to the correct BitWidth.
---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 48 +++++++------
 llvm/test/CodeGen/AArch64/setcc_knownbits.ll  |  4 +-
 .../CodeGen/AArch64SelectionDAGTest.cpp       | 68 +++++++++++++++++++
 3 files changed, 98 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index e3e7de1573c818..5b1b7c7c627723 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3645,32 +3645,42 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
           }
         }
       }
-    } else if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
-      // If this is a ZEXTLoad and we are looking at the loaded value.
-      EVT VT = LD->getMemoryVT();
-      unsigned MemBits = VT.getScalarSizeInBits();
-      Known.Zero.setBitsFrom(MemBits);
-    } else if (const MDNode *Ranges = LD->getRanges()) {
-      EVT VT = LD->getValueType(0);
-
-      // TODO: Handle for extending loads
-      if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
+    } else if (Op.getResNo() == 0) {
+      KnownBits Known0(!LD->getMemoryVT().isScalableVT()
+                           ? LD->getMemoryVT().getFixedSizeInBits()
+                           : BitWidth);
+      EVT VT = Op.getValueType();
+      // Fill in any known bits from range information. There are 3 types being
+      // used. The results VT (same vector elt size as BitWidth), the loaded
+      // MemoryVT (which may or may not be vector) and the range VTs original
+      // type. The range matadata needs the full range (i.e
+      // MemoryVT().getSizeInBits()), which is truncated to the correct elt size
+      // if it is know. These are then extended to the original VT sizes below.
+      if (const MDNode *MD = LD->getRanges()) {
+        computeKnownBitsFromRangeMetadata(*MD, Known0);
         if (VT.isVector()) {
           // Handle truncation to the first demanded element.
           // TODO: Figure out which demanded elements are covered
           if (DemandedElts != 1 || !getDataLayout().isLittleEndian())
             break;
+          Known0 = Known0.trunc(BitWidth);
+        }
+      }
 
-          // Handle the case where a load has a vector type, but scalar memory
-          // with an attached range.
-          EVT MemVT = LD->getMemoryVT();
-          KnownBits KnownFull(MemVT.getSizeInBits());
+      if (LD->getMemoryVT().isVector())
+        Known0 = Known0.trunc(LD->getMemoryVT().getScalarSizeInBits());
 
-          computeKnownBitsFromRangeMetadata(*Ranges, KnownFull);
-          Known = KnownFull.trunc(BitWidth);
-        } else
-          computeKnownBitsFromRangeMetadata(*Ranges, Known);
-      }
+      // Extend the Known bits from memory to the size of the result.
+      if (ISD::isZEXTLoad(Op.getNode()))
+        Known = Known0.zext(BitWidth);
+      else if (ISD::isSEXTLoad(Op.getNode()))
+        Known = Known0.sext(BitWidth);
+      else if (ISD::isEXTLoad(Op.getNode()))
+        Known = Known0.anyext(BitWidth);
+      else
+        Known = Known0;
+      assert(Known.getBitWidth() == BitWidth);
+      return Known;
     }
     break;
   }
diff --git a/llvm/test/CodeGen/AArch64/setcc_knownbits.ll b/llvm/test/CodeGen/AArch64/setcc_knownbits.ll
index 46b714d8e5fbbe..bb9546af8bb7b6 100644
--- a/llvm/test/CodeGen/AArch64/setcc_knownbits.ll
+++ b/llvm/test/CodeGen/AArch64/setcc_knownbits.ll
@@ -21,9 +21,7 @@ define noundef i1 @logger(i32 noundef %logLevel, ptr %ea, ptr %pll) {
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB1_2: // %land.rhs
 ; CHECK-NEXT:    ldr x8, [x1]
-; CHECK-NEXT:    ldrb w8, [x8]
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ldrb w0, [x8]
 ; CHECK-NEXT:    ret
 entry:
   %0 = load i32, ptr %pll, align 4
diff --git a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
index bb8e76a2eeb8be..e0772684e3a954 100644
--- a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
+++ b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
@@ -6,11 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/SourceMgr.h"
@@ -728,4 +730,70 @@ TEST_F(AArch64SelectionDAGTest, ReplaceAllUsesWith) {
   EXPECT_EQ(DAG->getPCSections(New.getNode()), MD);
 }
 
+TEST_F(AArch64SelectionDAGTest, computeKnownBits_extload_known01) {
+  SDLoc Loc;
+  auto Int8VT = EVT::getIntegerVT(Context, 8);
+  auto Int32VT = EVT::getIntegerVT(Context, 32);
+  auto Int64VT = EVT::getIntegerVT(Context, 64);
+  auto Ptr = DAG->getConstant(0, Loc, Int64VT);
+  auto PtrInfo =
+      MachinePointerInfo::getFixedStack(DAG->getMachineFunction(), 0);
+  AAMDNodes AA;
+  MDBuilder MDHelper(*DAG->getContext());
+  MDNode *Range = MDHelper.createRange(APInt(8, 0), APInt(8, 2));
+  MachineMemOperand *MMO = DAG->getMachineFunction().getMachineMemOperand(
+      PtrInfo, MachineMemOperand::MOLoad, 8, Align(8), AA, Range);
+
+  auto ALoad = DAG->getExtLoad(ISD::EXTLOAD, Loc, Int32VT, DAG->getEntryNode(),
+                               Ptr, Int8VT, MMO);
+  KnownBits Known = DAG->computeKnownBits(ALoad);
+  EXPECT_EQ(Known.Zero, APInt(32, 0xfe));
+  EXPECT_EQ(Known.One, APInt(32, 0));
+
+  auto ZLoad = DAG->getExtLoad(ISD::ZEXTLOAD, Loc, Int32VT, DAG->getEntryNode(),
+                               Ptr, Int8VT, MMO);
+  Known = DAG->computeKnownBits(ZLoad);
+  EXPECT_EQ(Known.Zero, APInt(32, 0xfffffffe));
+  EXPECT_EQ(Known.One, APInt(32, 0));
+
+  auto SLoad = DAG->getExtLoad(ISD::SEXTLOAD, Loc, Int32VT, DAG->getEntryNode(),
+                               Ptr, Int8VT, MMO);
+  Known = DAG->computeKnownBits(SLoad);
+  EXPECT_EQ(Known.Zero, APInt(32, 0xfffffffe));
+  EXPECT_EQ(Known.One, APInt(32, 0));
+}
+
+TEST_F(AArch64SelectionDAGTest, computeKnownBits_extload_knownnegative) {
+  SDLoc Loc;
+  auto Int8VT = EVT::getIntegerVT(Context, 8);
+  auto Int32VT = EVT::getIntegerVT(Context, 32);
+  auto Int64VT = EVT::getIntegerVT(Context, 64);
+  auto Ptr = DAG->getConstant(0, Loc, Int64VT);
+  auto PtrInfo =
+      MachinePointerInfo::getFixedStack(DAG->getMachineFunction(), 0);
+  AAMDNodes AA;
+  MDBuilder MDHelper(*DAG->getContext());
+  MDNode *Range = MDHelper.createRange(APInt(8, 0xf0), APInt(8, 0xff));
+  MachineMemOperand *MMO = DAG->getMachineFunction().getMachineMemOperand(
+      PtrInfo, MachineMemOperand::MOLoad, 8, Align(8), AA, Range);
+
+  auto ALoad = DAG->getExtLoad(ISD::EXTLOAD, Loc, Int32VT, DAG->getEntryNode(),
+                               Ptr, Int8VT, MMO);
+  KnownBits Known = DAG->computeKnownBits(ALoad);
+  EXPECT_EQ(Known.Zero, APInt(32, 0));
+  EXPECT_EQ(Known.One, APInt(32, 0xf0));
+
+  auto ZLoad = DAG->getExtLoad(ISD::ZEXTLOAD, Loc, Int32VT, DAG->getEntryNode(),
+                               Ptr, Int8VT, MMO);
+  Known = DAG->computeKnownBits(ZLoad);
+  EXPECT_EQ(Known.Zero, APInt(32, 0xffffff00));
+  EXPECT_EQ(Known.One, APInt(32, 0x000000f0));
+
+  auto SLoad = DAG->getExtLoad(ISD::SEXTLOAD, Loc, Int32VT, DAG->getEntryNode(),
+                               Ptr, Int8VT, MMO);
+  Known = DAG->computeKnownBits(SLoad);
+  EXPECT_EQ(Known.Zero, APInt(32, 0));
+  EXPECT_EQ(Known.One, APInt(32, 0xfffffff0));
+}
+
 } // end namespace llvm

From 6c2eec5ceadf26ce8d732d718a8906d075a7d6c7 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic@amd.com>
Date: Thu, 29 Feb 2024 13:57:59 +0100
Subject: [PATCH 11/55] AMDGPU/GlobalISel: lane masks merging  (#73337)

Basic implementation of lane mask merging for GlobalISel.
Lane masks on GlobalISel are registers with sgpr register class
and S1 LLT - required by machine uniformity analysis.
Implements equivalent of lowerPhis from SILowerI1Copies.cpp in:
patch 1: https://github.com/llvm/llvm-project/pull/75340
patch 2: https://github.com/llvm/llvm-project/pull/75349
patch 3: https://github.com/llvm/llvm-project/pull/80003
patch 4: https://github.com/llvm/llvm-project/pull/78431
patch 5: is in this commit:

AMDGPU/GlobalISelDivergenceLowering: constrain incoming registers

Previously, in PHIs that represent lane masks, incoming registers
taken as-is were not selected as lane masks. Such registers are not
being merged with another lane mask and most often only have S1 LLT.
Implement constrainAsLaneMask by constraining incoming registers
taken as-is with lane mask attributes, essentially transforming them
to lane masks. This is final step in having PHI instructions created
in this pass to be fully instruction-selected.
---
 .../AMDGPUGlobalISelDivergenceLowering.cpp    |  11 +-
 ...-divergent-i1-phis-no-lane-mask-merging.ll |  24 +--
 ...divergent-i1-phis-no-lane-mask-merging.mir |  66 ++++---
 ...vergence-divergent-i1-used-outside-loop.ll |  36 ++--
 ...ergence-divergent-i1-used-outside-loop.mir | 177 +++++++++---------
 .../GlobalISel/divergence-structurizer.ll     |  13 +-
 .../GlobalISel/divergence-structurizer.mir    | 161 ++++++++--------
 7 files changed, 256 insertions(+), 232 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
index 4f65a95de82ac8..a0c6bf7cc31c0a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -177,7 +177,16 @@ void DivergenceLoweringHelper::buildMergeLaneMasks(
   B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
 }
 
-void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { return; }
+// GlobalISel has to constrain S1 incoming taken as-is with lane mask register
+// class. Insert a copy of Incoming.Reg to new lane mask inside Incoming.Block,
+// Incoming.Reg becomes that new lane mask.
+void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
+  B.setInsertPt(*In.Block, In.Block->getFirstTerminator());
+
+  auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
+  MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
+  In.Reg = Copy.getReg(0);
+}
 
 } // End anonymous namespace.
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index 0f70c1996d6e02..d4d5cb18bbd30e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 
 ; Divergent phis that don't require lowering using lane mask merging
 
@@ -147,32 +147,28 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
 ; GFX10-LABEL: divergent_i1_phi_used_inside_loop_bigger_loop_body:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
-; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, 1.0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x3e8
-; GFX10-NEXT:    v_mov_b32_e32 v8, s5
+; GFX10-NEXT:    v_mov_b32_e32 v8, s4
 ; GFX10-NEXT:    ; implicit-def: $sgpr6
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_branch .LBB3_2
 ; GFX10-NEXT:  .LBB3_1: ; %loop_body
 ; GFX10-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v9, v8
-; GFX10-NEXT:    s_xor_b32 s4, s4, -1
+; GFX10-NEXT:    s_xor_b32 s5, s5, -1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v8
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s4
-; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
-; GFX10-NEXT:    s_and_b32 s4, exec_lo, s4
-; GFX10-NEXT:    s_or_b32 s6, s6, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_and_b32 s7, exec_lo, s5
+; GFX10-NEXT:    s_or_b32 s6, s6, s7
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_6
 ; GFX10-NEXT:  .LBB3_2: ; %loop_start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_and_b32_e32 v9, 1, v9
 ; GFX10-NEXT:    v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8
 ; GFX10-NEXT:    s_mov_b32 s7, 1
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v9
 ; GFX10-NEXT:    s_cbranch_vccz .LBB3_4
 ; GFX10-NEXT:  ; %bb.3: ; %else
 ; GFX10-NEXT:    ; in Loop: Header=BB3_2 Depth=1
@@ -189,7 +185,7 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
 ; GFX10-NEXT:    flat_store_dword v[4:5], v1
 ; GFX10-NEXT:    s_branch .LBB3_1
 ; GFX10-NEXT:  .LBB3_6: ; %exit
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s6
 ; GFX10-NEXT:    flat_store_dword v[2:3], v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
index 5549c89dc402f8..9b0bd2752b8231 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
@@ -33,6 +33,7 @@ body: |
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
   ; GFX10-NEXT:   G_BRCOND [[ICMP1]](s1), %bb.2
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
@@ -46,7 +47,8 @@ body: |
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = G_PHI %14(s1), %bb.3, [[ICMP]](s1), %bb.0
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.0, %20(s1), %bb.3
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
@@ -54,12 +56,13 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C3]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY7]](s1), [[C5]], [[C4]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
@@ -126,9 +129,10 @@ body: |
   ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr0
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
-  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[COPY4]](s1)
   ; GFX10-NEXT:   G_BRCOND [[ICMP1]](s1), %bb.2
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
@@ -137,17 +141,17 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY6]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY7]](s1), [[C4]], [[C3]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
@@ -292,19 +296,21 @@ body: |
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1)
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %39(s1), %bb.5
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %19(s1), %bb.5
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %42(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.0, %39(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
-  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI2]](s32), [[C3]]
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI3]](s32), [[C3]]
   ; GFX10-NEXT:   G_BRCOND [[ICMP]](s1), %bb.4
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
@@ -336,26 +342,27 @@ body: |
   ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C8]]
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
-  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
+  ; GFX10-NEXT:   [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[COPY10]], [[C8]]
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI3]](s32)
   ; GFX10-NEXT:   [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C9]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C9]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
   ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
-  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
   ; GFX10-NEXT:   [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C11:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY10]](s1), [[C11]], [[C10]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY13]](s1), [[C11]], [[C10]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
   ; GFX10-NEXT:   SI_RETURN
   bb.0:
@@ -475,6 +482,7 @@ body: |
   ; GFX10-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[AND1]](s32)
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[TRUNC1]], [[C5]]
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1)
   ; GFX10-NEXT:   G_BRCOND [[XOR]](s1), %bb.2
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
@@ -487,9 +495,10 @@ body: |
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %30(s32), %bb.4, [[DEF]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = G_PHI %32(s1), %bb.4, [[C5]](s1), %bb.0
-  ; GFX10-NEXT:   G_BRCOND [[PHI1]](s1), %bb.5
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %58(s1), %bb.4
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %30(s32), %bb.4, [[DEF]](s32), %bb.0
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   G_BRCOND [[COPY4]](s1), %bb.5
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
@@ -517,6 +526,7 @@ body: |
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[PHI5]](s32), [[AMDGPU_BUFFER_LOAD]]
   ; GFX10-NEXT:   [[OR1:%[0-9]+]]:_(s1) = G_OR [[ICMP]], [[ICMP2]]
   ; GFX10-NEXT:   [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s1)
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
@@ -527,7 +537,7 @@ body: |
   ; GFX10-NEXT:   [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[C11]]
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI]](s32), %bb.2, [[OR2]](s32), %bb.5
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI1]](s32), %bb.2, [[OR2]](s32), %bb.5
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[LOAD]](<8 x s32>)
   ; GFX10-NEXT:   [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY1]]
   ; GFX10-NEXT:   [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index e9df20f9688e6d..49c232661c6dc1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 
 ; This file contains various tests that have divergent i1s used outside of
 ; the loop. These are lane masks is sgpr and need to have correct value in
@@ -137,28 +137,24 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val,
 ; GFX10-LABEL: divergent_i1_xor_used_outside_loop:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
-; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, 1.0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-NEXT:    ; implicit-def: $sgpr6
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX10-NEXT:  .LBB2_1: ; %loop
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v5, v1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v1
+; GFX10-NEXT:    s_xor_b32 s5, s5, -1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v5, v0
-; GFX10-NEXT:    s_xor_b32 s7, vcc_lo, -1
-; GFX10-NEXT:    s_or_b32 s5, s4, s5
-; GFX10-NEXT:    v_mov_b32_e32 v4, s7
-; GFX10-NEXT:    s_andn2_b32 s4, s6, exec_lo
-; GFX10-NEXT:    s_and_b32 s6, exec_lo, s7
-; GFX10-NEXT:    s_or_b32 s6, s4, s6
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v0
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
+; GFX10-NEXT:    s_and_b32 s7, exec_lo, s5
+; GFX10-NEXT:    s_or_b32 s6, s6, s7
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
 ; GFX10-NEXT:  ; %bb.2: ; %exit
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s6
 ; GFX10-NEXT:    flat_store_dword v[2:3], v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -197,7 +193,7 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:    s_mov_b32 s6, 1
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_6
 ; GFX10-NEXT:  ; %bb.1: ; %loop.start.preheader
@@ -332,7 +328,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, v1, v4
-; GFX10-NEXT:    s_mov_b32 s7, 1
+; GFX10-NEXT:    s_mov_b32 s7, -1
 ; GFX10-NEXT:    ; implicit-def: $vgpr5
 ; GFX10-NEXT:    s_and_saveexec_b32 s8, s4
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_1
@@ -410,7 +406,7 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
 ; GFX10-LABEL: divergent_i1_freeze_used_outside_loop:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_mov_b32 s0, 0
-; GFX10-NEXT:    s_mov_b32 s3, 1
+; GFX10-NEXT:    s_mov_b32 s3, -1
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX10-NEXT:    ; implicit-def: $sgpr1
 ; GFX10-NEXT:    ; implicit-def: $sgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
index ace9bec6e1c2c8..206c0adb6c0c1f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
@@ -175,14 +175,15 @@ body: |
   ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
   ; GFX10-NEXT:   G_BRCOND [[ICMP1]](s1), %bb.1
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
-  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1)
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY14]](s1), [[C7]], [[C6]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY15]](s1), [[C7]], [[C6]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV1]](p0) :: (store (s32))
   ; GFX10-NEXT:   SI_RETURN
   bb.0:
@@ -255,37 +256,40 @@ body: |
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1)
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %24(s1), %bb.1
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
-  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %27(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, %24(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
-  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY6]], [[C2]]
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI3]](s32)
   ; GFX10-NEXT:   [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C3]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY9]](s1), [[C5]], [[C4]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
   ; GFX10-NEXT:   SI_RETURN
   bb.0:
@@ -349,7 +353,8 @@ body: |
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C1]](s1)
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY5]](s1)
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
@@ -365,26 +370,26 @@ body: |
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[C1]](s1), %bb.0, %39(s1), %bb.8
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %40(s1), %bb.8
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY6]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.1, %72(s1), %bb.7
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.1, %61(s1), %bb.7
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.1, %48(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.1, %73(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.1, %62(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.1, %49(s1), %bb.7
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7
   ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
   ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
   ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI5]](s32)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C4]](s32)
@@ -392,14 +397,14 @@ body: |
   ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C5]]
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
   ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
@@ -407,16 +412,16 @@ body: |
   ; GFX10-NEXT:   successors: %bb.7(0x80000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
-  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[C6]](s1)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[C6]](s1)
   ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI5]], [[C7]]
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI5]](s32), [[COPY]]
-  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY14]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.7
   ; GFX10-NEXT: {{  $}}
@@ -436,15 +441,15 @@ body: |
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.3, [[S_OR_B32_3]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.3, [[S_OR_B32_2]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3
-  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1)
-  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
   ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY17]], [[C9]]
-  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY16]](s1), [[PHI4]](s32)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY18]], [[C9]]
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY17]](s1), [[PHI4]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY19]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.8
@@ -453,11 +458,11 @@ body: |
   ; GFX10-NEXT:   successors: %bb.2(0x80000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.7
-  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_4]](s1)
-  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY19]](s1)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_4]](s1)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY20]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY20]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY21]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_5]](s1), [[S_AND_B32_5]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.2
   bb.0:
@@ -574,7 +579,7 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF1]](s1), %bb.0, %38(s1), %bb.6
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF1]](s1), %bb.0, %39(s1), %bb.6
   ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
   ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
@@ -600,9 +605,10 @@ body: |
   ; GFX10-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI2]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[COPY8]](s1)
   ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
@@ -610,21 +616,21 @@ body: |
   ; GFX10-NEXT:   successors: %bb.6(0x80000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C4]]
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
   ; GFX10-NEXT:   successors: %bb.7(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[C2]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
-  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY11]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
@@ -636,9 +642,9 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.6
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI2]](s32), %bb.6
-  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_1]](s1)
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_1]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
-  ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY11]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY12]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.8
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.8:
@@ -751,26 +757,27 @@ body: |
   ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %53(s1), %bb.3
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %42(s1), %bb.3
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[C1]](s1), %bb.0, %32(s1), %bb.3
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %54(s1), %bb.3
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %43(s1), %bb.3
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %33(s1), %bb.3
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %12(s32), %bb.3
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI2]](s1)
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI2]](s1)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[COPY8]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY8]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
@@ -783,10 +790,10 @@ body: |
   ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C3]]
-  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
   ; GFX10-NEXT:   [[DEF2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
-  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
@@ -794,32 +801,32 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[PHI2]](s1), %bb.1, [[DEF2]](s1), %bb.2
-  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
-  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[COPY11]]
-  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[FREEZE]](s1)
-  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[FREEZE]](s1)
+  ; GFX10-NEXT:   [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[COPY12]]
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[FREEZE]](s1)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[FREEZE]](s1)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C4]]
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI4]](s32), [[COPY]]
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI3]](s32)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
   ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
-  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_3]](s1)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_3]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY15]](s1), [[C6]], [[C5]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY16]](s1), [[C6]], [[C5]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV1]](p0) :: (store (s32))
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
index 609fff51863a03..1698f84eea5185 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 
 ; Simples case, if - then, that requires lane mask merging,
 ; %phi lane mask will hold %val_A at %A. Lanes that are active in %B
@@ -43,13 +43,12 @@ define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid,
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s2, 0, s0
-; GFX10-NEXT:    ; implicit-def: $sgpr0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-NEXT:    s_xor_b32 s1, exec_lo, s1
 ; GFX10-NEXT:  ; %bb.1: ; %B
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
-; GFX10-NEXT:    s_andn2_b32 s0, s2, exec_lo
+; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-NEXT:    ; implicit-def: $vgpr2
 ; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
 ; GFX10-NEXT:    s_or_b32 s0, s0, s2
@@ -211,7 +210,7 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %
 ; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v4, v7
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
-; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    s_mov_b32 s4, -1
 ; GFX10-NEXT:    global_load_dword v9, v[9:10], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
@@ -308,7 +307,7 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %
 ; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v4, v9
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v5, v10, vcc_lo
-; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    s_mov_b32 s4, -1
 ; GFX10-NEXT:    global_load_dword v11, v[11:12], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
@@ -318,7 +317,7 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %
 ; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v6, v9
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v7, v10, vcc_lo
-; GFX10-NEXT:    s_mov_b32 s5, 1
+; GFX10-NEXT:    s_mov_b32 s5, -1
 ; GFX10-NEXT:    global_load_dword v11, v[11:12], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
index df5505e1b28bbc..8197b072c740b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
@@ -18,9 +18,10 @@ body: |
   ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
-  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C1]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[COPY4]](s1)
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
@@ -29,18 +30,18 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY6]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY7]](s1), [[C4]], [[C3]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
@@ -91,18 +92,20 @@ body: |
   ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF
-  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1)
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[COPY4]](s1)
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.3
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[COPY5]](s1)
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, %20(s1), %bb.3
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[COPY6]](s1)
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
   ; GFX10-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_ELSE [[SI_IF]](s32), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
@@ -111,9 +114,9 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C1]]
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
@@ -122,19 +125,19 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY5]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY7]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_ELSE]](s32)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY9]](s1), [[C3]], [[C4]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY11]](s1), [[C3]], [[C4]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
@@ -368,13 +371,14 @@ body: |
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.5(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C5]](s32)
   ; GFX10-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64)
   ; GFX10-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[C6]]
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[COPY9]](s1)
   ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
@@ -383,9 +387,9 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %47(s1), %bb.5
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY11]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
@@ -402,21 +406,21 @@ body: |
   ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C8]]
   ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C9]]
-  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   successors: %bb.3(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[C4]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY9]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2
-  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
-  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[COPY12]](s1)
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[COPY13]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
@@ -560,13 +564,14 @@ body: |
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.5(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C5]](s32)
   ; GFX10-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64)
   ; GFX10-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[C6]]
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[COPY11]](s1)
   ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
@@ -575,9 +580,9 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %60(s1), %bb.5
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY12]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY13]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.8
   ; GFX10-NEXT: {{  $}}
@@ -585,26 +590,27 @@ body: |
   ; GFX10-NEXT:   successors: %bb.6(0x40000000), %bb.7(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[C7]](s1)
   ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C8]](s32)
   ; GFX10-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV3]], [[SHL2]](s64)
   ; GFX10-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD2]](s32), [[C9]]
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[C7]](s1)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1)
   ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP2]](s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   successors: %bb.3(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[C4]](s1), %bb.2, %71(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY11]](s1), %bb.2, %72(s1), %bb.7
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2
-  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
-  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[COPY16]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
@@ -621,21 +627,21 @@ body: |
   ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C11]]
   ; GFX10-NEXT:   [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
   ; GFX10-NEXT:   [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C12]]
-  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP3]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP3]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY15]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.7:
   ; GFX10-NEXT:   successors: %bb.5(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[C7]](s1), %bb.4, [[S_OR_B32_2]](s1), %bb.6
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[COPY14]](s1), %bb.4, [[S_OR_B32_2]](s1), %bb.6
   ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4
-  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
-  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[COPY17]](s1)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[COPY19]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY20]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
@@ -970,6 +976,7 @@ body: |
   ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   [[DEF3:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
   ; GFX10-NEXT:   G_BR %bb.7
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
@@ -982,19 +989,19 @@ body: |
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI %67(s1), %bb.6, %70(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI %67(s1), %bb.6, %71(s1), %bb.7
   ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI %49(s1), %bb.6, %48(s1), %bb.7
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI %35(s1), %bb.6, %34(s1), %bb.7
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
-  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY9]](s1)
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY10]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), %17(s32)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY9]](s1), %17(s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_]](s1)
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
@@ -1011,28 +1018,28 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INTRINSIC_CONVERGENT]](s32)
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY5]](s32), [[COPY]]
-  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C2]](s1)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C2]](s1)
   ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C2]]
   ; GFX10-NEXT:   [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP2]], [[XOR]]
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[OR]](s1), %25(s32)
   ; GFX10-NEXT:   [[DEF4:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   [[DEF5:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %63(s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT2]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4
-  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY14]](s1), [[COPY3]], [[COPY2]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY15]](s1), [[COPY3]], [[COPY2]]
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[SELECT]](s32)
   ; GFX10-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT3]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $sgpr0
@@ -1042,14 +1049,14 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT1]](s32), %bb.3
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
-  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
   ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %42(s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %56(s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[DEF6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   G_BR %bb.2
@@ -1057,27 +1064,27 @@ body: |
   ; GFX10-NEXT: bb.7:
   ; GFX10-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY7]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.0, [[PHI7]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, [[PHI1]](s1), %bb.2, [[DEF5]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI9:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, [[PHI2]](s1), %bb.2, [[DEF4]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI10:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4, [[PHI10]](s32), %bb.2, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI11:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INTRINSIC_CONVERGENT]](s32), %bb.2, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
-  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
-  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]](s1)
-  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI9]](s1)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]](s1)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[PHI9]](s1)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY20]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY21]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_5:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY6]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_5:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_5]](s1), [[S_AND_B32_5]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_5]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_6:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY19]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_6:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY21]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_5]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_6:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY20]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_6:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY22]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_6:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_6]](s1), [[S_AND_B32_6]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_6]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY17]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[COPY24:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_6]](s1)
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.1
   bb.0:
     successors: %bb.7(0x80000000)

From 7e88d5176060fa4d86376460db8310083b7c0e2d Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Thu, 29 Feb 2024 13:00:29 +0000
Subject: [PATCH 12/55] [NFC][RemoveDIs] Have CreateNeg only accept iterators
 (#82999)

Removing debug-intrinsics requires that we always insert with an
iterator, not with an instruction position. To enforce that, we need to
eliminate the `Instruction *` taking functions. It's safe to leave the
insert-at-end-of-block functions as the intention is clear for debug
info purposes (i.e., insert after both instructions and debug-info at
the end of the function).

This patch demonstrates how that needs to happen. At a variety of
call-sites to the `CreateNeg` constructor we need to consider:
* Has this instruction been selected because of the operation it
performs? In that case, just call `getIterator` and pass an iterator in.
* Has this instruction been selected because of it's position? If so, we
need to keep the iterator identifying that position (see the 3rd hunk
changing Reassociate.cpp, although it's coincidentally not debug-info
significant).

This also demonstrates what we'll try and do with the constructor
methods going forwards: have one fully explicit set of parameters
including iterator, and another with default-arguments where the
block-to-insert-into argument defaults to nullptr / no-position,
creating an instruction that hasn't been inserted yet.
---
 llvm/include/llvm/IR/InstrTypes.h                  |  4 +---
 llvm/lib/IR/Instruction.cpp                        |  9 ++++-----
 llvm/lib/IR/Instructions.cpp                       |  8 --------
 .../Scalar/CorrelatedValuePropagation.cpp          | 14 ++++++++------
 llvm/lib/Transforms/Scalar/Reassociate.cpp         |  8 +++++---
 5 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index 4ee51cd192ed7c..a124fbb88e3bcd 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -468,9 +468,7 @@ class BinaryOperator : public Instruction {
   static BinaryOperator *CreateNeg(Value *Op, const Twine &Name,
                                    BasicBlock::iterator InsertBefore);
   static BinaryOperator *CreateNeg(Value *Op, const Twine &Name = "",
-                                   Instruction *InsertBefore = nullptr);
-  static BinaryOperator *CreateNeg(Value *Op, const Twine &Name,
-                                   BasicBlock *InsertAtEnd);
+                                   BasicBlock *InsertAtEnd = nullptr);
   static BinaryOperator *CreateNSWNeg(Value *Op, const Twine &Name,
                                       BasicBlock::iterator InsertBefore);
   static BinaryOperator *CreateNSWNeg(Value *Op, const Twine &Name = "",
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index c54f8d7aca4a96..ce221758ef798b 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -46,11 +46,11 @@ Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
 
 Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
                          BasicBlock *InsertAtEnd)
-  : User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(nullptr) {
+    : User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(nullptr) {
 
-  // append this instruction into the basic block
-  assert(InsertAtEnd && "Basic block to append to may not be NULL!");
-  insertInto(InsertAtEnd, InsertAtEnd->end());
+  // If requested, append this instruction into the basic block.
+  if (InsertAtEnd)
+    insertInto(InsertAtEnd, InsertAtEnd->end());
 }
 
 Instruction::~Instruction() {
@@ -73,7 +73,6 @@ Instruction::~Instruction() {
   setMetadata(LLVMContext::MD_DIAssignID, nullptr);
 }
 
-
 void Instruction::setParent(BasicBlock *P) {
   Parent = P;
 }
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 25778570ebf34a..976015656a5f96 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -3245,14 +3245,6 @@ BinaryOperator *BinaryOperator::CreateNeg(Value *Op, const Twine &Name,
                             InsertBefore);
 }
 
-BinaryOperator *BinaryOperator::CreateNeg(Value *Op, const Twine &Name,
-                                          Instruction *InsertBefore) {
-  Value *Zero = ConstantInt::get(Op->getType(), 0);
-  return new BinaryOperator(Instruction::Sub,
-                            Zero, Op,
-                            Op->getType(), Name, InsertBefore);
-}
-
 BinaryOperator *BinaryOperator::CreateNeg(Value *Op, const Twine &Name,
                                           BasicBlock *InsertAtEnd) {
   Value *Zero = ConstantInt::get(Op->getType(), 0);
diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 6ce9eb3656c93a..490cb7e528eb6f 100644
--- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -905,8 +905,8 @@ static bool processSRem(BinaryOperator *SDI, const ConstantRange &LCR,
   for (Operand &Op : Ops) {
     if (Op.D == Domain::NonNegative)
       continue;
-    auto *BO =
-        BinaryOperator::CreateNeg(Op.V, Op.V->getName() + ".nonneg", SDI);
+    auto *BO = BinaryOperator::CreateNeg(Op.V, Op.V->getName() + ".nonneg",
+                                         SDI->getIterator());
     BO->setDebugLoc(SDI->getDebugLoc());
     Op.V = BO;
   }
@@ -919,7 +919,8 @@ static bool processSRem(BinaryOperator *SDI, const ConstantRange &LCR,
 
   // If the divident was non-positive, we need to negate the result.
   if (Ops[0].D == Domain::NonPositive) {
-    Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg", SDI);
+    Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg",
+                                    SDI->getIterator());
     Res->setDebugLoc(SDI->getDebugLoc());
   }
 
@@ -966,8 +967,8 @@ static bool processSDiv(BinaryOperator *SDI, const ConstantRange &LCR,
   for (Operand &Op : Ops) {
     if (Op.D == Domain::NonNegative)
       continue;
-    auto *BO =
-        BinaryOperator::CreateNeg(Op.V, Op.V->getName() + ".nonneg", SDI);
+    auto *BO = BinaryOperator::CreateNeg(Op.V, Op.V->getName() + ".nonneg",
+                                         SDI->getIterator());
     BO->setDebugLoc(SDI->getDebugLoc());
     Op.V = BO;
   }
@@ -981,7 +982,8 @@ static bool processSDiv(BinaryOperator *SDI, const ConstantRange &LCR,
 
   // If the operands had two different domains, we need to negate the result.
   if (Ops[0].D != Ops[1].D) {
-    Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg", SDI);
+    Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg",
+                                    SDI->getIterator());
     Res->setDebugLoc(SDI->getDebugLoc());
   }
 
diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index 818c7b40d489ef..61109ed3765987 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -270,7 +270,8 @@ static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
 }
 
 static Instruction *CreateNeg(Value *S1, const Twine &Name,
-                              Instruction *InsertBefore, Value *FlagsOp) {
+                              BasicBlock::iterator InsertBefore,
+                              Value *FlagsOp) {
   if (S1->getType()->isIntOrIntVectorTy())
     return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
 
@@ -958,7 +959,8 @@ static Value *NegateValue(Value *V, Instruction *BI,
 
   // Insert a 'neg' instruction that subtracts the value from zero to get the
   // negation.
-  Instruction *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI);
+  Instruction *NewNeg =
+      CreateNeg(V, V->getName() + ".neg", BI->getIterator(), BI);
   ToRedo.insert(NewNeg);
   return NewNeg;
 }
@@ -1246,7 +1248,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
   }
 
   if (NeedsNegate)
-    V = CreateNeg(V, "neg", &*InsertPt, BO);
+    V = CreateNeg(V, "neg", InsertPt, BO);
 
   return V;
 }

From 780d55690ec5d8cd5c0e5fa4af799155d5d00534 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Thu, 29 Feb 2024 13:00:37 +0000
Subject: [PATCH 13/55] [mlir][ArmSVE] Fix test-setArmVLBits.mlir after #83213

---
 .../CPU/ArmSVE/Emulated/test-setArmVLBits.mlir  | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/Emulated/test-setArmVLBits.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/Emulated/test-setArmVLBits.mlir
index 4f46c6e1ebf6a8..aa8d0e4d5104ab 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/Emulated/test-setArmVLBits.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/Emulated/test-setArmVLBits.mlir
@@ -8,7 +8,7 @@
 
 func.func @checkVScale() {
   %vscale = vector.vscale
-  vector.print str "vscale"
+  vector.print str "vscale = "
   vector.print %vscale : index
   return
 }
@@ -20,28 +20,23 @@ func.func @setAndCheckVL(%bits: i32) {
 }
 
 func.func @main() {
-  //      CHECK: vscale
-  // CHECK-NEXT: 1
+  //      CHECK: vscale = 1
   %c128 = arith.constant 128 : i32
   func.call @setAndCheckVL(%c128) : (i32) -> ()
 
-  //      CHECK: vscale
-  // CHECK-NEXT: 2
+  //      CHECK: vscale = 2
   %c256 = arith.constant 256 : i32
   func.call @setAndCheckVL(%c256) : (i32) -> ()
 
-  //      CHECK: vscale
-  // CHECK-NEXT: 4
+  //      CHECK: vscale = 4
   %c512 = arith.constant 512 : i32
   func.call @setAndCheckVL(%c512) : (i32) -> ()
 
-  //      CHECK: vscale
-  // CHECK-NEXT: 8
+  //      CHECK: vscale = 8
   %c1024 = arith.constant 1024 : i32
   func.call @setAndCheckVL(%c1024) : (i32) -> ()
 
-  //      CHECK: vscale
-  // CHECK-NEXT: 16
+  //      CHECK: vscale = 16
   %c2048 = arith.constant 2048 : i32
   func.call @setAndCheckVL(%c2048) : (i32) -> ()
 

From e08fe575d5953b6ca0d7a578c1afa00247f0a12f Mon Sep 17 00:00:00 2001
From: Dani <daniel.kiss@arm.com>
Date: Thu, 29 Feb 2024 14:05:37 +0100
Subject: [PATCH 14/55] [NFC][ARM][AArch64] Deduplicated code. (#82785)

Add the SignReturnAddressScopeKind to the BranchProtectionInfo class.
---
 clang/include/clang/Basic/TargetInfo.h | 36 +++++++++++++++++++++-----
 clang/lib/CodeGen/Targets/AArch64.cpp  |  3 +--
 clang/lib/CodeGen/Targets/ARM.cpp      |  8 +-----
 3 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 48e9cec482755c..b94d13609c3dd2 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -1369,13 +1369,35 @@ class TargetInfo : public TransferrableTargetInfo,
   }
 
   struct BranchProtectionInfo {
-    LangOptions::SignReturnAddressScopeKind SignReturnAddr =
-        LangOptions::SignReturnAddressScopeKind::None;
-    LangOptions::SignReturnAddressKeyKind SignKey =
-        LangOptions::SignReturnAddressKeyKind::AKey;
-    bool BranchTargetEnforcement = false;
-    bool BranchProtectionPAuthLR = false;
-    bool GuardedControlStack = false;
+    LangOptions::SignReturnAddressScopeKind SignReturnAddr;
+    LangOptions::SignReturnAddressKeyKind SignKey;
+    bool BranchTargetEnforcement;
+    bool BranchProtectionPAuthLR;
+    bool GuardedControlStack;
+
+    BranchProtectionInfo() = default;
+
+    const char *getSignReturnAddrStr() const {
+      switch (SignReturnAddr) {
+      case LangOptions::SignReturnAddressScopeKind::None:
+        return "none";
+      case LangOptions::SignReturnAddressScopeKind::NonLeaf:
+        return "non-leaf";
+      case LangOptions::SignReturnAddressScopeKind::All:
+        return "all";
+      }
+      assert(false && "Unexpected SignReturnAddressScopeKind");
+    }
+
+    const char *getSignKeyStr() const {
+      switch (SignKey) {
+      case LangOptions::SignReturnAddressKeyKind::AKey:
+        return "a_key";
+      case LangOptions::SignReturnAddressKeyKind::BKey:
+        return "b_key";
+      }
+      assert(false && "Unexpected SignReturnAddressKeyKind");
+    }
   };
 
   /// Determine if the Architecture in this TargetInfo supports branch
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index adfdd516351901..2b8e2aeb4265f3 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -132,8 +132,7 @@ class AArch64TargetCodeGenInfo : public TargetCodeGenInfo {
     assert(Error.empty());
 
     auto *Fn = cast<llvm::Function>(GV);
-    static const char *SignReturnAddrStr[] = {"none", "non-leaf", "all"};
-    Fn->addFnAttr("sign-return-address", SignReturnAddrStr[static_cast<int>(BPI.SignReturnAddr)]);
+    Fn->addFnAttr("sign-return-address", BPI.getSignReturnAddrStr());
 
     if (BPI.SignReturnAddr != LangOptions::SignReturnAddressScopeKind::None) {
       Fn->addFnAttr("sign-return-address-key",
diff --git a/clang/lib/CodeGen/Targets/ARM.cpp b/clang/lib/CodeGen/Targets/ARM.cpp
index d7d175ff1724f7..5d42e6286e525b 100644
--- a/clang/lib/CodeGen/Targets/ARM.cpp
+++ b/clang/lib/CodeGen/Targets/ARM.cpp
@@ -152,13 +152,7 @@ class ARMTargetCodeGenInfo : public TargetCodeGenInfo {
               diag::warn_target_unsupported_branch_protection_attribute)
               << Arch;
         } else {
-          static const char *SignReturnAddrStr[] = {"none", "non-leaf", "all"};
-          assert(static_cast<unsigned>(BPI.SignReturnAddr) <= 2 &&
-                 "Unexpected SignReturnAddressScopeKind");
-          Fn->addFnAttr(
-              "sign-return-address",
-              SignReturnAddrStr[static_cast<int>(BPI.SignReturnAddr)]);
-
+          Fn->addFnAttr("sign-return-address", BPI.getSignReturnAddrStr());
           Fn->addFnAttr("branch-target-enforcement",
                         BPI.BranchTargetEnforcement ? "true" : "false");
         }

From 86f4b4dfde543287c1b29ecae27cc1bee470eebb Mon Sep 17 00:00:00 2001
From: Erich Keane <ekeane@nvidia.com>
Date: Thu, 29 Feb 2024 06:06:57 -0800
Subject: [PATCH 15/55] [OpenACC] Implement Compute Construct 'goto' in/out
 logic (#83326)

Compute Constructs do not permit jumping in/out of them, so this patch
implements this for 'goto' as a followup to the other patches that have
done the same thing.

It does this by modifying the JumpDiagnostics to work with this, plus
setting the function to needing jump diagnostics if we discover a goto
or label inside of a Compute Construct.
---
 .../clang/Basic/DiagnosticSemaKinds.td        |   4 +
 clang/lib/Sema/JumpDiagnostics.cpp            |  19 +-
 clang/lib/Sema/SemaStmt.cpp                   |  16 ++
 clang/test/SemaOpenACC/no-branch-in-out.c     | 197 ++++++++++++++++++
 4 files changed, 234 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index f726805dc02bd9..378b537e029710 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -12214,4 +12214,8 @@ def err_acc_construct_appertainment
 def err_acc_branch_in_out_compute_construct
     : Error<"invalid %select{branch|return}0 %select{out of|into}1 OpenACC "
             "Compute Construct">;
+def note_acc_branch_into_compute_construct
+    : Note<"invalid branch into OpenACC Compute Construct">;
+def note_acc_branch_out_of_compute_construct
+    : Note<"invalid branch out of OpenACC Compute Construct">;
 } // end of sema component.
diff --git a/clang/lib/Sema/JumpDiagnostics.cpp b/clang/lib/Sema/JumpDiagnostics.cpp
index ec3892e92f3c3b..6722878883be8e 100644
--- a/clang/lib/Sema/JumpDiagnostics.cpp
+++ b/clang/lib/Sema/JumpDiagnostics.cpp
@@ -604,6 +604,16 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S,
     break;
   }
 
+  case Stmt::OpenACCComputeConstructClass: {
+    unsigned NewParentScope = Scopes.size();
+    OpenACCComputeConstruct *CC = cast<OpenACCComputeConstruct>(S);
+    Scopes.push_back(GotoScope(
+        ParentScope, diag::note_acc_branch_into_compute_construct,
+        diag::note_acc_branch_out_of_compute_construct, CC->getBeginLoc()));
+    BuildScopeInformation(CC->getStructuredBlock(), NewParentScope);
+    return;
+  }
+
   default:
     if (auto *ED = dyn_cast<OMPExecutableDirective>(S)) {
       if (!ED->isStandaloneDirective()) {
@@ -936,11 +946,16 @@ void JumpScopeChecker::CheckJump(Stmt *From, Stmt *To, SourceLocation DiagLoc,
       if (Scopes[I].InDiag == diag::note_protected_by_seh_finally) {
         S.Diag(From->getBeginLoc(), diag::warn_jump_out_of_seh_finally);
         break;
-      }
-      if (Scopes[I].InDiag == diag::note_omp_protected_structured_block) {
+      } else if (Scopes[I].InDiag ==
+                 diag::note_omp_protected_structured_block) {
         S.Diag(From->getBeginLoc(), diag::err_goto_into_protected_scope);
         S.Diag(To->getBeginLoc(), diag::note_omp_exits_structured_block);
         break;
+      } else if (Scopes[I].InDiag ==
+                 diag::note_acc_branch_into_compute_construct) {
+        S.Diag(From->getBeginLoc(), diag::err_goto_into_protected_scope);
+        S.Diag(Scopes[I].Loc, diag::note_acc_branch_out_of_compute_construct);
+        return;
       }
     }
   }
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index 0a5c2b23a90c8e..ca2d206752744c 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -567,6 +567,11 @@ Sema::ActOnLabelStmt(SourceLocation IdentLoc, LabelDecl *TheDecl,
     Diag(IdentLoc, diag::warn_reserved_extern_symbol)
         << TheDecl << static_cast<int>(Status);
 
+  // If this label is in a compute construct scope, we need to make sure we
+  // check gotos in/out.
+  if (getCurScope()->isInOpenACCComputeConstructScope())
+    setFunctionHasBranchProtectedScope();
+
   // Otherwise, things are good.  Fill in the declaration and return it.
   LabelStmt *LS = new (Context) LabelStmt(IdentLoc, TheDecl, SubStmt);
   TheDecl->setStmt(LS);
@@ -3304,6 +3309,12 @@ StmtResult Sema::ActOnGotoStmt(SourceLocation GotoLoc,
                                SourceLocation LabelLoc,
                                LabelDecl *TheDecl) {
   setFunctionHasBranchIntoScope();
+
+  // If this goto is in a compute construct scope, we need to make sure we check
+  // gotos in/out.
+  if (getCurScope()->isInOpenACCComputeConstructScope())
+    setFunctionHasBranchProtectedScope();
+
   TheDecl->markUsed(Context);
   return new (Context) GotoStmt(TheDecl, GotoLoc, LabelLoc);
 }
@@ -3332,6 +3343,11 @@ Sema::ActOnIndirectGotoStmt(SourceLocation GotoLoc, SourceLocation StarLoc,
 
   setFunctionHasIndirectGoto();
 
+  // If this goto is in a compute construct scope, we need to make sure we
+  // check gotos in/out.
+  if (getCurScope()->isInOpenACCComputeConstructScope())
+    setFunctionHasBranchProtectedScope();
+
   return new (Context) IndirectGotoStmt(GotoLoc, StarLoc, E);
 }
 
diff --git a/clang/test/SemaOpenACC/no-branch-in-out.c b/clang/test/SemaOpenACC/no-branch-in-out.c
index f8fb40a1ca8f72..d070247fa65b86 100644
--- a/clang/test/SemaOpenACC/no-branch-in-out.c
+++ b/clang/test/SemaOpenACC/no-branch-in-out.c
@@ -113,3 +113,200 @@ void Return() {
     }
   }
 }
+
+void Goto() {
+  int j;
+#pragma acc parallel // expected-note{{invalid branch out of OpenACC Compute Construct}}
+  while(j) {
+    if (j <3)
+      goto LABEL; // expected-error{{cannot jump from this goto statement to its label}}
+  }
+
+LABEL:
+  {}
+
+  goto LABEL_IN; // expected-error{{cannot jump from this goto statement to its label}}
+
+#pragma acc parallel // expected-note{{invalid branch into OpenACC Compute Construct}}
+  for(int i = 0; i < 5; ++i) {
+LABEL_IN:
+    {}
+  }
+
+#pragma acc parallel
+  for(int i = 0; i < 5; ++i) {
+LABEL_NOT_CALLED:
+    {}
+  }
+
+#pragma acc parallel
+  {
+    goto ANOTHER_LOOP; // expected-error{{cannot jump from this goto statement to its label}}
+
+  }
+#pragma acc parallel// expected-note{{invalid branch into OpenACC Compute Construct}}
+
+  {
+ANOTHER_LOOP:
+    {}
+  }
+
+#pragma acc parallel
+  {
+  while (j) {
+    --j;
+    if (j < 3)
+      goto LABEL2;
+
+    if (j > 4)
+      break;
+  }
+LABEL2:
+  {}
+  }
+
+#pragma acc parallel
+  do {
+    if (j < 3)
+      goto LABEL3;
+
+    if (j > 4)
+      break; // expected-error{{invalid branch out of OpenACC Compute Construct}}
+
+LABEL3:
+  {}
+  } while (j);
+
+LABEL4:
+  {}
+#pragma acc parallel// expected-note{{invalid branch out of OpenACC Compute Construct}}
+  {
+    goto LABEL4;// expected-error{{cannot jump from this goto statement to its label}}
+  }
+
+#pragma acc parallel// expected-note{{invalid branch into OpenACC Compute Construct}}
+
+  {
+LABEL5:
+    {}
+  }
+
+  {
+    goto LABEL5;// expected-error{{cannot jump from this goto statement to its label}}
+  }
+
+#pragma acc parallel
+  {
+LABEL6:
+    {}
+    goto LABEL6;
+
+  }
+
+#pragma acc parallel
+  goto LABEL7; // expected-error{{cannot jump from this goto statement to its label}}
+#pragma acc parallel// expected-note{{invalid branch into OpenACC Compute Construct}}
+  {
+LABEL7:{}
+  }
+
+#pragma acc parallel
+  LABEL8:{}
+#pragma acc parallel// expected-note{{invalid branch out of OpenACC Compute Construct}}
+  {
+    goto LABEL8;// expected-error{{cannot jump from this goto statement to its label}}
+  }
+
+
+#pragma acc parallel// expected-note{{invalid branch into OpenACC Compute Construct}}
+  {
+LABEL9:{}
+  }
+
+  ({goto LABEL9;});// expected-error{{cannot jump from this goto statement to its label}}
+
+#pragma acc parallel// expected-note{{invalid branch out of OpenACC Compute Construct}}
+  {
+  ({goto LABEL10;});// expected-error{{cannot jump from this goto statement to its label}}
+  }
+
+LABEL10:{}
+
+  ({goto LABEL11;});// expected-error{{cannot jump from this goto statement to its label}}
+#pragma acc parallel// expected-note{{invalid branch into OpenACC Compute Construct}}
+  {
+LABEL11:{}
+  }
+
+LABEL12:{}
+#pragma acc parallel// expected-note{{invalid branch out of OpenACC Compute Construct}}
+  {
+  ({goto LABEL12;});// expected-error{{cannot jump from this goto statement to its label}}
+  }
+
+#pragma acc parallel
+  {
+  ({goto LABEL13;});
+LABEL13:{}
+  }
+
+#pragma acc parallel
+  {
+  LABEL14:{}
+  ({goto LABEL14;});
+  }
+}
+
+void IndirectGoto1() {
+  void* ptr;
+#pragma acc parallel
+  {
+LABEL1:{}
+    ptr = &&LABEL1;
+
+    goto *ptr;
+
+  }
+}
+
+void IndirectGoto2() {
+  void* ptr;
+LABEL2:{} // #GOTOLBL2
+    ptr = &&LABEL2;
+#pragma acc parallel // #GOTOPAR2
+  {
+// expected-error@+3{{cannot jump from this indirect goto statement to one of its possible targets}}
+// expected-note@#GOTOLBL2{{possible target of indirect goto statement}}
+// expected-note@#GOTOPAR2{{invalid branch out of OpenACC Compute Construct}}
+    goto *ptr;
+  }
+}
+
+void IndirectGoto3() {
+  void* ptr;
+#pragma acc parallel // #GOTOPAR3
+  {
+LABEL3:{} // #GOTOLBL3
+    ptr = &&LABEL3;
+  }
+// expected-error@+3{{cannot jump from this indirect goto statement to one of its possible targets}}
+// expected-note@#GOTOLBL3{{possible target of indirect goto statement}}
+// expected-note@#GOTOPAR3{{invalid branch into OpenACC Compute Construct}}
+  goto *ptr;
+}
+
+void IndirectGoto4() {
+  void* ptr;
+#pragma acc parallel // #GOTOPAR4
+  {
+LABEL4:{}
+    ptr = &&LABEL4;
+// expected-error@+3{{cannot jump from this indirect goto statement to one of its possible targets}}
+// expected-note@#GOTOLBL5{{possible target of indirect goto statement}}
+// expected-note@#GOTOPAR4{{invalid branch out of OpenACC Compute Construct}}
+    goto *ptr;
+  }
+LABEL5:// #GOTOLBL5
+
+  ptr=&&LABEL5;
+}

From 6f7d824b804b272335d55f5b899295db833f3829 Mon Sep 17 00:00:00 2001
From: Vinayak Dev <104419489+vinayakdsci@users.noreply.github.com>
Date: Thu, 29 Feb 2024 19:39:39 +0530
Subject: [PATCH 16/55] [Clang][Sema]: Diagnose lambda to bool implicit casts
 (#83152)

Adds diagnostics for lambda expressions being cast to boolean values,
which results in the expression always evaluating to true.
Earlier, Clang allowed compilation of such erroneous programs, but now
emits a warning through `-Wpointer-bool-conversion`.

Fixes #82512
---
 clang/docs/ReleaseNotes.rst                          |  3 +++
 clang/include/clang/Basic/DiagnosticSemaKinds.td     |  4 ++--
 clang/lib/Sema/SemaChecking.cpp                      | 11 +++++++++++
 clang/test/CXX/drs/dr18xx.cpp                        |  1 +
 .../CXX/expr/expr.prim/expr.prim.lambda/blocks.mm    |  9 +++++----
 clang/test/SemaCXX/warn-bool-conversion.cpp          | 12 ++++++++++++
 6 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7e16b9f0c67dbd..a5c6b80c4e99e1 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -192,6 +192,9 @@ Improvements to Clang's diagnostics
 
 - Clang now diagnoses declarative nested name specifiers that name alias templates.
 
+- Clang now diagnoses lambda function expressions being implicitly cast to boolean values, under ``-Wpointer-bool-conversion``.
+  Fixes `#82512 <https://github.com/llvm/llvm-project/issues/82512>`_.
+
 Improvements to Clang's time-trace
 ----------------------------------
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 378b537e029710..ff88c4f293abfd 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -4127,8 +4127,8 @@ def ext_ms_impcast_fn_obj : ExtWarn<
   "Microsoft extension">, InGroup<MicrosoftCast>;
 
 def warn_impcast_pointer_to_bool : Warning<
-    "address of%select{| function| array}0 '%1' will always evaluate to "
-    "'true'">,
+    "address of %select{'%1'|function '%1'|array '%1'|lambda function pointer "
+    "conversion operator}0 will always evaluate to 'true'">,
     InGroup<PointerBoolConversion>;
 def warn_cast_nonnull_to_bool : Warning<
     "nonnull %select{function call|parameter}0 '%1' will evaluate to "
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 979b63884359fc..35d453e013e84b 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -16544,6 +16544,17 @@ void Sema::DiagnoseAlwaysNonNullPointer(Expr *E,
     }
   }
 
+  // Complain if we are converting a lambda expression to a boolean value
+  if (const auto *MCallExpr = dyn_cast<CXXMemberCallExpr>(E)) {
+    if (const auto *MRecordDecl = MCallExpr->getRecordDecl();
+        MRecordDecl && MRecordDecl->isLambda()) {
+      Diag(E->getExprLoc(), diag::warn_impcast_pointer_to_bool)
+          << /*LambdaPointerConversionOperatorType=*/3
+          << MRecordDecl->getSourceRange() << Range << IsEqual;
+      return;
+    }
+  }
+
   // Expect to find a single Decl.  Skip anything more complicated.
   ValueDecl *D = nullptr;
   if (DeclRefExpr *R = dyn_cast<DeclRefExpr>(E)) {
diff --git a/clang/test/CXX/drs/dr18xx.cpp b/clang/test/CXX/drs/dr18xx.cpp
index a7cee4ef8902f9..e78730e8992cf8 100644
--- a/clang/test/CXX/drs/dr18xx.cpp
+++ b/clang/test/CXX/drs/dr18xx.cpp
@@ -282,6 +282,7 @@ namespace dr1837 { // dr1837: 3.3
   struct A {
     int f();
     bool b = [] {
+      // since-cxx11-warning@-1 {{address of lambda function pointer conversion operator will always evaluate to 'true'}}
       struct Local {
         static_assert(sizeof(this->f()) == sizeof(int), "");
       };
diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.lambda/blocks.mm b/clang/test/CXX/expr/expr.prim/expr.prim.lambda/blocks.mm
index cb56f6816ad036..e93c37f3b9ae12 100644
--- a/clang/test/CXX/expr/expr.prim/expr.prim.lambda/blocks.mm
+++ b/clang/test/CXX/expr/expr.prim/expr.prim.lambda/blocks.mm
@@ -65,10 +65,10 @@ void nesting() {
 
 namespace overloading {
   void bool_conversion() {
-    if ([](){}) {
+    if ([](){}) { // expected-warning{{address of lambda function pointer conversion operator will always evaluate to 'true'}}
     }
 
-    bool b = []{};
+    bool b = []{}; // expected-warning{{address of lambda function pointer conversion operator will always evaluate to 'true'}}
     b = (bool)[]{};
   }
 
@@ -108,8 +108,9 @@ void call_with_lambda() {
     using decltype(a)::operator id<void(*)()>; // expected-note {{here}}
   } extern d;
 
-  bool r1 = c;
-  bool r2 = d; // expected-error {{private}}
+  bool r1 = c; // expected-warning{{address of lambda function pointer conversion operator will always evaluate to 'true'}}
+  bool r2 = d; // expected-error {{private}} \
+                  expected-warning{{address of lambda function pointer conversion operator will always evaluate to 'true'}}
 }
 
 namespace PR13117 {
diff --git a/clang/test/SemaCXX/warn-bool-conversion.cpp b/clang/test/SemaCXX/warn-bool-conversion.cpp
index c81d52d864f2d2..9e8cf0e4f8944a 100644
--- a/clang/test/SemaCXX/warn-bool-conversion.cpp
+++ b/clang/test/SemaCXX/warn-bool-conversion.cpp
@@ -81,6 +81,18 @@ struct S2 {
 
 bool f5();
 bool f6(int);
+#if __cplusplus >= 201103L
+auto f7 = []{};
+auto f8 = [](){};
+
+void foo() {
+  bool b;
+  b = f7; // expected-warning {{address of lambda function pointer conversion operator will always evaluate to 'true'}}
+  b = f8; // expected-warning {{address of lambda function pointer conversion operator will always evaluate to 'true'}}
+  bool is_true = [](){ return true; };
+  // expected-warning@-1{{address of lambda function pointer conversion operator will always evaluate to 'true'}}
+}
+#endif
 
 void bar() {
   bool b;

From b1c8b9f89cac91db857b9123838ac6b6aeb0ae74 Mon Sep 17 00:00:00 2001
From: "S. Bharadwaj Yadavalli" <Bharadwaj.Yadavalli@microsoft.com>
Date: Thu, 29 Feb 2024 09:21:44 -0500
Subject: [PATCH 17/55] [DirectX][NFC] Leverage LLVM and DirectX intrinsic
 description in DXIL Op records (#83193)

* Leverage TableGen record descriptions of LLVM or DirectX intrinsics
that can be directly mapped in DXIL Ops TableGen description. As a
result, such DXIL Ops can be succinctly described without duplication.
DXILEmitter backend can derive the properties of DXIL Ops accordingly.
* Ensured that corresponding lit tests pass.
---
 llvm/lib/Target/DirectX/DXIL.td            | 347 +++++++++++-------
 llvm/lib/Target/DirectX/DXILOpBuilder.cpp  |  25 +-
 llvm/lib/Target/DirectX/DXILOpBuilder.h    |   3 +-
 llvm/lib/Target/DirectX/DXILOpLowering.cpp |   3 +-
 llvm/utils/TableGen/DXILEmitter.cpp        | 408 +++++++++------------
 5 files changed, 413 insertions(+), 373 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 8a3454c89542ce..67ef7986622092 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -12,139 +12,224 @@
 //===----------------------------------------------------------------------===//
 
 include "llvm/IR/Intrinsics.td"
-include "llvm/IR/Attributes.td"
 
-// Abstract representation of the class a DXIL Operation belongs to.
-class DXILOpClass<string name> {
-  string Name = name;
+class DXILOpClass;
+
+// Following is a set of DXIL Operation classes whose names appear to be
+// arbitrary, yet need to be a substring of the function name used during
+// lowering to DXIL Operation calls. These class name strings are specified
+// as the third argument of add_dixil_op in utils/hct/hctdb.py and case converted
+// in utils/hct/hctdb_instrhelp.py of DirectXShaderCompiler repo. The function
+// name has the format "dx.op.<class-name>.<return-type>".
+
+defset list<DXILOpClass> OpClasses = {
+  def acceptHitAndEndSearch : DXILOpClass;
+  def allocateNodeOutputRecords : DXILOpClass;
+  def allocateRayQuery : DXILOpClass;
+  def annotateHandle : DXILOpClass;
+  def annotateNodeHandle : DXILOpClass;
+  def annotateNodeRecordHandle : DXILOpClass;
+  def atomicBinOp : DXILOpClass;
+  def atomicCompareExchange : DXILOpClass;
+  def attributeAtVertex : DXILOpClass;
+  def barrier : DXILOpClass;
+  def barrierByMemoryHandle : DXILOpClass;
+  def barrierByMemoryType : DXILOpClass;
+  def barrierByNodeRecordHandle : DXILOpClass;
+  def binary : DXILOpClass;
+  def binaryWithCarryOrBorrow : DXILOpClass;
+  def binaryWithTwoOuts : DXILOpClass;
+  def bitcastF16toI16 : DXILOpClass;
+  def bitcastF32toI32 : DXILOpClass;
+  def bitcastF64toI64 : DXILOpClass;
+  def bitcastI16toF16 : DXILOpClass;
+  def bitcastI32toF32 : DXILOpClass;
+  def bitcastI64toF64 : DXILOpClass;
+  def bufferLoad : DXILOpClass;
+  def bufferStore : DXILOpClass;
+  def bufferUpdateCounter : DXILOpClass;
+  def calculateLOD : DXILOpClass;
+  def callShader : DXILOpClass;
+  def cbufferLoad : DXILOpClass;
+  def cbufferLoadLegacy : DXILOpClass;
+  def checkAccessFullyMapped : DXILOpClass;
+  def coverage : DXILOpClass;
+  def createHandle : DXILOpClass;
+  def createHandleForLib : DXILOpClass;
+  def createHandleFromBinding : DXILOpClass;
+  def createHandleFromHeap : DXILOpClass;
+  def createNodeInputRecordHandle : DXILOpClass;
+  def createNodeOutputHandle : DXILOpClass;
+  def cutStream : DXILOpClass;
+  def cycleCounterLegacy : DXILOpClass;
+  def discard : DXILOpClass;
+  def dispatchMesh : DXILOpClass;
+  def dispatchRaysDimensions : DXILOpClass;
+  def dispatchRaysIndex : DXILOpClass;
+  def domainLocation : DXILOpClass;
+  def dot2 : DXILOpClass;
+  def dot2AddHalf : DXILOpClass;
+  def dot3 : DXILOpClass;
+  def dot4 : DXILOpClass;
+  def dot4AddPacked : DXILOpClass;
+  def emitIndices : DXILOpClass;
+  def emitStream : DXILOpClass;
+  def emitThenCutStream : DXILOpClass;
+  def evalCentroid : DXILOpClass;
+  def evalSampleIndex : DXILOpClass;
+  def evalSnapped : DXILOpClass;
+  def finishedCrossGroupSharing : DXILOpClass;
+  def flattenedThreadIdInGroup : DXILOpClass;
+  def geometryIndex : DXILOpClass;
+  def getDimensions : DXILOpClass;
+  def getInputRecordCount : DXILOpClass;
+  def getMeshPayload : DXILOpClass;
+  def getNodeRecordPtr : DXILOpClass;
+  def getRemainingRecursionLevels : DXILOpClass;
+  def groupId : DXILOpClass;
+  def gsInstanceID : DXILOpClass;
+  def hitKind : DXILOpClass;
+  def ignoreHit : DXILOpClass;
+  def incrementOutputCount : DXILOpClass;
+  def indexNodeHandle : DXILOpClass;
+  def innerCoverage : DXILOpClass;
+  def instanceID : DXILOpClass;
+  def instanceIndex : DXILOpClass;
+  def isHelperLane : DXILOpClass;
+  def isSpecialFloat : DXILOpClass;
+  def legacyDoubleToFloat : DXILOpClass;
+  def legacyDoubleToSInt32 : DXILOpClass;
+  def legacyDoubleToUInt32 : DXILOpClass;
+  def legacyF16ToF32 : DXILOpClass;
+  def legacyF32ToF16 : DXILOpClass;
+  def loadInput : DXILOpClass;
+  def loadOutputControlPoint : DXILOpClass;
+  def loadPatchConstant : DXILOpClass;
+  def makeDouble : DXILOpClass;
+  def minPrecXRegLoad : DXILOpClass;
+  def minPrecXRegStore : DXILOpClass;
+  def nodeOutputIsValid : DXILOpClass;
+  def objectRayDirection : DXILOpClass;
+  def objectRayOrigin : DXILOpClass;
+  def objectToWorld : DXILOpClass;
+  def outputComplete : DXILOpClass;
+  def outputControlPointID : DXILOpClass;
+  def pack4x8 : DXILOpClass;
+  def primitiveID : DXILOpClass;
+  def primitiveIndex : DXILOpClass;
+  def quadOp : DXILOpClass;
+  def quadReadLaneAt : DXILOpClass;
+  def quadVote : DXILOpClass;
+  def quaternary : DXILOpClass;
+  def rawBufferLoad : DXILOpClass;
+  def rawBufferStore : DXILOpClass;
+  def rayFlags : DXILOpClass;
+  def rayQuery_Abort : DXILOpClass;
+  def rayQuery_CommitNonOpaqueTriangleHit : DXILOpClass;
+  def rayQuery_CommitProceduralPrimitiveHit : DXILOpClass;
+  def rayQuery_Proceed : DXILOpClass;
+  def rayQuery_StateMatrix : DXILOpClass;
+  def rayQuery_StateScalar : DXILOpClass;
+  def rayQuery_StateVector : DXILOpClass;
+  def rayQuery_TraceRayInline : DXILOpClass;
+  def rayTCurrent : DXILOpClass;
+  def rayTMin : DXILOpClass;
+  def renderTargetGetSampleCount : DXILOpClass;
+  def renderTargetGetSamplePosition : DXILOpClass;
+  def reportHit : DXILOpClass;
+  def sample : DXILOpClass;
+  def sampleBias : DXILOpClass;
+  def sampleCmp : DXILOpClass;
+  def sampleCmpBias : DXILOpClass;
+  def sampleCmpGrad : DXILOpClass;
+  def sampleCmpLevel : DXILOpClass;
+  def sampleCmpLevelZero : DXILOpClass;
+  def sampleGrad : DXILOpClass;
+  def sampleIndex : DXILOpClass;
+  def sampleLevel : DXILOpClass;
+  def setMeshOutputCounts : DXILOpClass;
+  def splitDouble : DXILOpClass;
+  def startInstanceLocation : DXILOpClass;
+  def startVertexLocation : DXILOpClass;
+  def storeOutput : DXILOpClass;
+  def storePatchConstant : DXILOpClass;
+  def storePrimitiveOutput : DXILOpClass;
+  def storeVertexOutput : DXILOpClass;
+  def tempRegLoad : DXILOpClass;
+  def tempRegStore : DXILOpClass;
+  def tertiary : DXILOpClass;
+  def texture2DMSGetSamplePosition : DXILOpClass;
+  def textureGather : DXILOpClass;
+  def textureGatherCmp : DXILOpClass;
+  def textureGatherRaw : DXILOpClass;
+  def textureLoad : DXILOpClass;
+  def textureStore : DXILOpClass;
+  def textureStoreSample : DXILOpClass;
+  def threadId : DXILOpClass;
+  def threadIdInGroup : DXILOpClass;
+  def traceRay : DXILOpClass;
+  def unary : DXILOpClass;
+  def unaryBits : DXILOpClass;
+  def unpack4x8 : DXILOpClass;
+  def viewID : DXILOpClass;
+  def waveActiveAllEqual : DXILOpClass;
+  def waveActiveBallot : DXILOpClass;
+  def waveActiveBit : DXILOpClass;
+  def waveActiveOp : DXILOpClass;
+  def waveAllOp : DXILOpClass;
+  def waveAllTrue : DXILOpClass;
+  def waveAnyTrue : DXILOpClass;
+  def waveGetLaneCount : DXILOpClass;
+  def waveGetLaneIndex : DXILOpClass;
+  def waveIsFirstLane : DXILOpClass;
+  def waveMatch : DXILOpClass;
+  def waveMatrix_Accumulate : DXILOpClass;
+  def waveMatrix_Annotate : DXILOpClass;
+  def waveMatrix_Depth : DXILOpClass;
+  def waveMatrix_Fill : DXILOpClass;
+  def waveMatrix_LoadGroupShared : DXILOpClass;
+  def waveMatrix_LoadRawBuf : DXILOpClass;
+  def waveMatrix_Multiply : DXILOpClass;
+  def waveMatrix_ScalarOp : DXILOpClass;
+  def waveMatrix_StoreGroupShared : DXILOpClass;
+  def waveMatrix_StoreRawBuf : DXILOpClass;
+  def waveMultiPrefixBitCount : DXILOpClass;
+  def waveMultiPrefixOp : DXILOpClass;
+  def wavePrefixOp : DXILOpClass;
+  def waveReadLaneAt : DXILOpClass;
+  def waveReadLaneFirst : DXILOpClass;
+  def worldRayDirection : DXILOpClass;
+  def worldRayOrigin : DXILOpClass;
+  def worldToObject : DXILOpClass;
+  def writeSamplerFeedback : DXILOpClass;
+  def writeSamplerFeedbackBias : DXILOpClass;
+  def writeSamplerFeedbackGrad : DXILOpClass;
+  def writeSamplerFeedbackLevel: DXILOpClass;
 }
 
-// Abstract representation of the category a DXIL Operation belongs to
-class DXILOpCategory<string name> {
-  string Name = name;
+// Abstraction DXIL Operation to LLVM intrinsic
+class DXILOpMapping<int opCode, DXILOpClass opClass, Intrinsic intrinsic, string doc> {
+  int OpCode = opCode;                 // Opcode corresponding to DXIL Operation
+  DXILOpClass OpClass = opClass;             // Class of DXIL Operation.
+  Intrinsic LLVMIntrinsic = intrinsic; // LLVM Intrinsic the DXIL Operation maps
+  string Doc = doc;                    // to a short description of the operation
 }
 
-def UnaryClass : DXILOpClass<"Unary">;
-def BinaryClass : DXILOpClass<"Binary">;
-def FlattenedThreadIdInGroupClass : DXILOpClass<"FlattenedThreadIdInGroup">;
-def ThreadIdInGroupClass : DXILOpClass<"ThreadIdInGroup">;
-def ThreadIdClass : DXILOpClass<"ThreadId">;
-def GroupIdClass : DXILOpClass<"GroupId">;
-
-def BinaryUintCategory : DXILOpCategory<"Binary uint">;
-def UnaryFloatCategory : DXILOpCategory<"Unary float">;
-def ComputeIDCategory : DXILOpCategory<"Compute/Mesh/Amplification shader">;
-
-// Represent as any pointer type with an option to change to a qualified pointer
-// type with address space specified.
-def dxil_handle_ty  : LLVMAnyPointerType;
-def dxil_cbuffer_ty : LLVMAnyPointerType;
-def dxil_resource_ty : LLVMAnyPointerType;
-
-// The parameter description for a DXIL operation
-class DXILOpParameter<int pos, LLVMType type, string name, string doc,
-                 bit isConstant = 0, string enumName = "",
-                 int maxValue = 0> {
-  int Pos = pos;               // Position in parameter list
-  LLVMType ParamType = type;   // Parameter type
-  string Name = name;          // Short, unique parameter name
-  string Doc = doc;            // Description of this parameter
-  bit IsConstant = isConstant; // Whether this parameter requires a constant value in the IR
-  string EnumName = enumName;  // Name of the enum type, if applicable
-  int MaxValue = maxValue;     // Maximum value for this parameter, if applicable
-}
-
-// A representation for a DXIL operation
-class DXILOperationDesc {
-  string OpName = "";         // Name of DXIL operation
-  int OpCode = 0;             // Unique non-negative integer associated with the operation
-  DXILOpClass  OpClass;       // Class of the operation
-  DXILOpCategory OpCategory;  // Category of the operation
-  string Doc = "";            // Description of the operation
-  list<DXILOpParameter> Params = []; // Parameter list of the operation
-  list<LLVMType> OverloadTypes = [];  // Overload types, if applicable
-  EnumAttr Attribute;         // Operation Attribute. Leverage attributes defined in Attributes.td
-                              // ReadNone - operation does not access memory.
-                              // ReadOnly - only reads from memory.
-                              // "ReadMemory"   - reads memory
-  bit IsDerivative = 0;       // Whether this is some kind of derivative
-  bit IsGradient = 0;         // Whether this requires a gradient calculation
-  bit IsFeedback = 0;         // Whether this is a sampler feedback operation
-  bit IsWave = 0;             // Whether this requires in-wave, cross-lane functionality
-  bit NeedsUniformInputs = 0; // Whether this operation requires that all
-                              // of its inputs are uniform across the wave
-  // Group DXIL operation for stats - e.g., to accumulate the number of atomic/float/uint/int/...
-  // operations used in the program.
-  list<string> StatsGroup = [];
-}
-
-class DXILOperation<string name, int opCode, DXILOpClass opClass, DXILOpCategory opCategory, string doc,
-              list<LLVMType> oloadTypes, EnumAttr attrs, list<DXILOpParameter> params,
-              list<string> statsGroup = []> : DXILOperationDesc {
-  let OpName = name;
-  let OpCode = opCode;
-  let Doc = doc;
-  let Params = params;
-  let OpClass = opClass;
-  let OpCategory = opCategory;
-  let OverloadTypes = oloadTypes;
-  let Attribute = attrs;
-  let StatsGroup = statsGroup;
-}
-
-// LLVM intrinsic that DXIL operation maps to.
-class LLVMIntrinsic<Intrinsic llvm_intrinsic_> { Intrinsic llvm_intrinsic = llvm_intrinsic_; }
-
-def Sin : DXILOperation<"Sin", 13, UnaryClass, UnaryFloatCategory, "returns sine(theta) for theta in radians.",
-  [llvm_half_ty, llvm_float_ty], ReadNone,
-  [
-    DXILOpParameter<0, llvm_anyfloat_ty, "", "operation result">,
-    DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
-    DXILOpParameter<2, llvm_anyfloat_ty, "value", "input value">
-  ],
-  ["floats"]>,
-  LLVMIntrinsic<int_sin>;
-
-def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b",
-    [llvm_i16_ty, llvm_i32_ty, llvm_i64_ty], ReadNone,
-  [
-    DXILOpParameter<0, llvm_anyint_ty, "", "operation result">,
-    DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
-    DXILOpParameter<2, llvm_anyint_ty, "a", "input value">,
-    DXILOpParameter<3, llvm_anyint_ty, "b", "input value">
-  ],
-  ["uints"]>,
-  LLVMIntrinsic<int_umax>;
-
-def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", [llvm_i32_ty], ReadNone,
-  [
-    DXILOpParameter<0, llvm_i32_ty, "", "thread ID component">,
-    DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
-    DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)">
-  ]>,
-  LLVMIntrinsic<int_dx_thread_id>;
-
-def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [llvm_i32_ty], ReadNone,
-  [
-    DXILOpParameter<0, llvm_i32_ty, "", "group ID component">,
-    DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
-    DXILOpParameter<2, llvm_i32_ty, "component", "component to read">
-  ]>,
-  LLVMIntrinsic<int_dx_group_id>;
-
-def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory,
-  "reads the thread ID within the group (SV_GroupThreadID)", [llvm_i32_ty], ReadNone,
-  [
-    DXILOpParameter<0, llvm_i32_ty, "", "thread ID in group component">,
-    DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">,
-    DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)">
-  ]>,
-  LLVMIntrinsic<int_dx_thread_id_in_group>;
-
-def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory,
-   "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [llvm_i32_ty], ReadNone,
-  [
-    DXILOpParameter<0, llvm_i32_ty, "", "result">,
-    DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">
-  ]>,
-  LLVMIntrinsic<int_dx_flattened_thread_id_in_group>;
+// Concrete definition of DXIL Operation mapping to corresponding LLVM intrinsic
+def Sin  : DXILOpMapping<13, unary, int_sin,
+                         "Returns sine(theta) for theta in radians.">;
+def UMax : DXILOpMapping<39, binary, int_umax,
+                         "Unsigned integer maximum. UMax(a,b) = a > b ? a : b">;
+def ThreadId : DXILOpMapping<93, threadId, int_dx_thread_id,
+                             "Reads the thread ID">;
+def GroupId  : DXILOpMapping<94, groupId, int_dx_group_id,
+                             "Reads the group ID (SV_GroupID)">;
+def ThreadIdInGroup : DXILOpMapping<95, threadIdInGroup,
+                                    int_dx_thread_id_in_group,
+                                    "Reads the thread ID within the group "
+                                    "(SV_GroupThreadID)">;
+def FlattenedThreadIdInGroup : DXILOpMapping<96, flattenedThreadIdInGroup,
+                                             int_dx_flattened_thread_id_in_group,
+                                             "Provides a flattened index for a "
+                                             "given thread within a given "
+                                             "group (SV_GroupIndex)">;
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
index 42180a865b72e3..21a20d45b922d9 100644
--- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
@@ -221,12 +221,26 @@ static Type *getTypeFromParameterKind(ParameterKind Kind, Type *OverloadTy) {
   return nullptr;
 }
 
+/// Construct DXIL function type. This is the type of a function with
+/// the following prototype
+///     OverloadType dx.op.<opclass>.<return-type>(int opcode, <param types>)
+/// <param-types> are constructed from types in Prop.
+/// \param Prop  Structure containing DXIL Operation properties based on
+///               its specification in DXIL.td.
+/// \param OverloadTy Return type to be used to construct DXIL function type.
 static FunctionType *getDXILOpFunctionType(const OpCodeProperty *Prop,
                                            Type *OverloadTy) {
   SmallVector<Type *> ArgTys;
 
   auto ParamKinds = getOpCodeParameterKind(*Prop);
 
+  // Add OverloadTy as return type of the function
+  ArgTys.emplace_back(OverloadTy);
+
+  // Add DXIL Opcode value type viz., Int32 as first argument
+  ArgTys.emplace_back(Type::getInt32Ty(OverloadTy->getContext()));
+
+  // Add DXIL Operation parameter types as specified in DXIL properties
   for (unsigned I = 0; I < Prop->NumOfParameters; ++I) {
     ParameterKind Kind = ParamKinds[I];
     ArgTys.emplace_back(getTypeFromParameterKind(Kind, OverloadTy));
@@ -267,13 +281,13 @@ CallInst *DXILOpBuilder::createDXILOpCall(dxil::OpCode OpCode, Type *OverloadTy,
   return B.CreateCall(Fn, FullArgs);
 }
 
-Type *DXILOpBuilder::getOverloadTy(dxil::OpCode OpCode, FunctionType *FT,
-                                   bool NoOpCodeParam) {
+Type *DXILOpBuilder::getOverloadTy(dxil::OpCode OpCode, FunctionType *FT) {
 
   const OpCodeProperty *Prop = getOpCodeProperty(OpCode);
+  // If DXIL Op has no overload parameter, just return the
+  // precise return type specified.
   if (Prop->OverloadParamIndex < 0) {
     auto &Ctx = FT->getContext();
-    // When only has 1 overload type, just return it.
     switch (Prop->OverloadTys) {
     case OverloadKind::VOID:
       return Type::getVoidTy(Ctx);
@@ -302,9 +316,8 @@ Type *DXILOpBuilder::getOverloadTy(dxil::OpCode OpCode, FunctionType *FT,
   // Prop->OverloadParamIndex is 0, overload type is FT->getReturnType().
   Type *OverloadType = FT->getReturnType();
   if (Prop->OverloadParamIndex != 0) {
-    // Skip Return Type and Type for DXIL opcode.
-    const unsigned SkipedParam = NoOpCodeParam ? 2 : 1;
-    OverloadType = FT->getParamType(Prop->OverloadParamIndex - SkipedParam);
+    // Skip Return Type.
+    OverloadType = FT->getParamType(Prop->OverloadParamIndex - 1);
   }
 
   auto ParamKinds = getOpCodeParameterKind(*Prop);
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.h b/llvm/lib/Target/DirectX/DXILOpBuilder.h
index 940ed538c7ce15..1c15f109184adf 100644
--- a/llvm/lib/Target/DirectX/DXILOpBuilder.h
+++ b/llvm/lib/Target/DirectX/DXILOpBuilder.h
@@ -31,8 +31,7 @@ class DXILOpBuilder {
   DXILOpBuilder(Module &M, IRBuilderBase &B) : M(M), B(B) {}
   CallInst *createDXILOpCall(dxil::OpCode OpCode, Type *OverloadTy,
                              llvm::iterator_range<Use *> Args);
-  Type *getOverloadTy(dxil::OpCode OpCode, FunctionType *FT,
-                      bool NoOpCodeParam);
+  Type *getOverloadTy(dxil::OpCode OpCode, FunctionType *FT);
   static const char *getOpCodeName(dxil::OpCode DXILOp);
 
 private:
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index f6e2297e9af41f..6b649b76beecdf 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -33,8 +33,7 @@ static void lowerIntrinsic(dxil::OpCode DXILOp, Function &F, Module &M) {
   IRBuilder<> B(M.getContext());
   Value *DXILOpArg = B.getInt32(static_cast<unsigned>(DXILOp));
   DXILOpBuilder DXILB(M, B);
-  Type *OverloadTy =
-      DXILB.getOverloadTy(DXILOp, F.getFunctionType(), /*NoOpCodeParam*/ true);
+  Type *OverloadTy = DXILB.getOverloadTy(DXILOp, F.getFunctionType());
   for (User *U : make_early_inc_range(F.users())) {
     CallInst *CI = dyn_cast<CallInst>(U);
     if (!CI)
diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp
index d47df597d53a35..fc958f5328736c 100644
--- a/llvm/utils/TableGen/DXILEmitter.cpp
+++ b/llvm/utils/TableGen/DXILEmitter.cpp
@@ -11,11 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "CodeGenTarget.h"
 #include "SequenceToOffsetTable.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGenTypes/MachineValueType.h"
 #include "llvm/Support/DXILABI.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/TableGenBackend.h"
@@ -30,28 +33,15 @@ struct DXILShaderModel {
   int Minor = 0;
 };
 
-struct DXILParameter {
-  int Pos; // position in parameter list
-  ParameterKind Kind;
-  StringRef Name; // short, unique name
-  StringRef Doc;  // the documentation description of this parameter
-  bool IsConst;   // whether this argument requires a constant value in the IR
-  StringRef EnumName; // the name of the enum type if applicable
-  int MaxValue;       // the maximum value for this parameter if applicable
-  DXILParameter(const Record *R);
-};
-
 struct DXILOperationDesc {
-  StringRef OpName;   // name of DXIL operation
+  std::string OpName; // name of DXIL operation
   int OpCode;         // ID of DXIL operation
   StringRef OpClass;  // name of the opcode class
-  StringRef Category; // classification for this instruction
   StringRef Doc;      // the documentation description of this instruction
-
-  SmallVector<DXILParameter> Params; // the operands that this instruction takes
-  SmallVector<ParameterKind> OverloadTypes; // overload types if applicable
-  StringRef Attr; // operation attribute; reference to string representation
-                  // of llvm::Attribute::AttrKind
+  SmallVector<MVT::SimpleValueType> OpTypes; // Vector of operand types -
+                                             // return type is at index 0
+  SmallVector<std::string>
+      OpAttributes;     // operation attribute represented as strings
   StringRef Intrinsic;  // The llvm intrinsic map to OpName. Default is "" which
                         // means no map exists
   bool IsDeriv = false; // whether this is some kind of derivative
@@ -74,81 +64,99 @@ struct DXILOperationDesc {
 };
 } // end anonymous namespace
 
-/*!
- Convert DXIL type name string to dxil::ParameterKind
-
- @param typeNameStr Type name string
- @return ParameterKind As defined in llvm/Support/DXILABI.h
-*/
-static ParameterKind lookupParameterKind(StringRef typeNameStr) {
-  auto paramKind = StringSwitch<ParameterKind>(typeNameStr)
-                       .Case("llvm_void_ty", ParameterKind::VOID)
-                       .Case("llvm_half_ty", ParameterKind::HALF)
-                       .Case("llvm_float_ty", ParameterKind::FLOAT)
-                       .Case("llvm_double_ty", ParameterKind::DOUBLE)
-                       .Case("llvm_i1_ty", ParameterKind::I1)
-                       .Case("llvm_i8_ty", ParameterKind::I8)
-                       .Case("llvm_i16_ty", ParameterKind::I16)
-                       .Case("llvm_i32_ty", ParameterKind::I32)
-                       .Case("llvm_i64_ty", ParameterKind::I64)
-                       .Case("llvm_anyfloat_ty", ParameterKind::OVERLOAD)
-                       .Case("llvm_anyint_ty", ParameterKind::OVERLOAD)
-                       .Case("dxil_handle_ty", ParameterKind::DXIL_HANDLE)
-                       .Case("dxil_cbuffer_ty", ParameterKind::CBUFFER_RET)
-                       .Case("dxil_resource_ty", ParameterKind::RESOURCE_RET)
-                       .Default(ParameterKind::INVALID);
-  assert(paramKind != ParameterKind::INVALID &&
-         "Unsupported DXIL Type specified");
-  return paramKind;
+/// Convert DXIL type name string to dxil::ParameterKind
+///
+/// \param VT Simple Value Type
+/// \return ParameterKind As defined in llvm/Support/DXILABI.h
+
+static ParameterKind getParameterKind(MVT::SimpleValueType VT) {
+  switch (VT) {
+  case MVT::isVoid:
+    return ParameterKind::VOID;
+  case MVT::f16:
+    return ParameterKind::HALF;
+  case MVT::f32:
+    return ParameterKind::FLOAT;
+  case MVT::f64:
+    return ParameterKind::DOUBLE;
+  case MVT::i1:
+    return ParameterKind::I1;
+  case MVT::i8:
+    return ParameterKind::I8;
+  case MVT::i16:
+    return ParameterKind::I16;
+  case MVT::i32:
+    return ParameterKind::I32;
+  case MVT::fAny:
+  case MVT::iAny:
+    return ParameterKind::OVERLOAD;
+  default:
+    llvm_unreachable("Support for specified DXIL Type not yet implemented");
+  }
 }
 
+/// Construct an object using the DXIL Operation records specified
+/// in DXIL.td. This serves as the single source of reference of
+/// the information extracted from the specified Record R, for
+/// C++ code generated by this TableGen backend.
+//  \param R Object representing TableGen record of a DXIL Operation
 DXILOperationDesc::DXILOperationDesc(const Record *R) {
-  OpName = R->getValueAsString("OpName");
+  OpName = R->getNameInitAsString();
   OpCode = R->getValueAsInt("OpCode");
-  OpClass = R->getValueAsDef("OpClass")->getValueAsString("Name");
-  Category = R->getValueAsDef("OpCategory")->getValueAsString("Name");
 
-  if (R->getValue("llvm_intrinsic")) {
-    auto *IntrinsicDef = R->getValueAsDef("llvm_intrinsic");
+  Doc = R->getValueAsString("Doc");
+
+  if (R->getValue("LLVMIntrinsic")) {
+    auto *IntrinsicDef = R->getValueAsDef("LLVMIntrinsic");
     auto DefName = IntrinsicDef->getName();
     assert(DefName.starts_with("int_") && "invalid intrinsic name");
     // Remove the int_ from intrinsic name.
     Intrinsic = DefName.substr(4);
+    // TODO: It is expected that return type and parameter types of
+    // DXIL Operation are the same as that of the intrinsic. Deviations
+    // are expected to be encoded in TableGen record specification and
+    // handled accordingly here. Support to be added later, as needed.
+    // Get parameter type list of the intrinsic. Types attribute contains
+    // the list of as [returnType, param1Type,, param2Type, ...]
+
+    OverloadParamIndex = -1;
+    auto TypeRecs = IntrinsicDef->getValueAsListOfDefs("Types");
+    unsigned TypeRecsSize = TypeRecs.size();
+    // Populate return type and parameter type names
+    for (unsigned i = 0; i < TypeRecsSize; i++) {
+      auto TR = TypeRecs[i];
+      OpTypes.emplace_back(getValueType(TR->getValueAsDef("VT")));
+      // Get the overload parameter index.
+      // TODO : Seems hacky. Is it possible that more than one parameter can
+      // be of overload kind??
+      // TODO: Check for any additional constraints specified for DXIL operation
+      // restricting return type.
+      if (i > 0) {
+        auto &CurParam = OpTypes.back();
+        if (getParameterKind(CurParam) >= ParameterKind::OVERLOAD) {
+          OverloadParamIndex = i;
+        }
+      }
+    }
+    // Get the operation class
+    OpClass = R->getValueAsDef("OpClass")->getName();
+
+    // NOTE: For now, assume that attributes of DXIL Operation are the same as
+    // that of the intrinsic. Deviations are expected to be encoded in TableGen
+    // record specification and handled accordingly here. Support to be added
+    // later.
+    auto IntrPropList = IntrinsicDef->getValueAsListInit("IntrProperties");
+    auto IntrPropListSize = IntrPropList->size();
+    for (unsigned i = 0; i < IntrPropListSize; i++) {
+      OpAttributes.emplace_back(IntrPropList->getElement(i)->getAsString());
+    }
   }
-
-  Doc = R->getValueAsString("Doc");
-
-  ListInit *ParamList = R->getValueAsListInit("Params");
-  OverloadParamIndex = -1;
-  for (unsigned I = 0; I < ParamList->size(); ++I) {
-    Record *Param = ParamList->getElementAsRecord(I);
-    Params.emplace_back(DXILParameter(Param));
-    auto &CurParam = Params.back();
-    if (CurParam.Kind >= ParameterKind::OVERLOAD)
-      OverloadParamIndex = I;
-  }
-  ListInit *OverloadTypeList = R->getValueAsListInit("OverloadTypes");
-
-  for (unsigned I = 0; I < OverloadTypeList->size(); ++I) {
-    Record *R = OverloadTypeList->getElementAsRecord(I);
-    OverloadTypes.emplace_back(lookupParameterKind(R->getNameInitAsString()));
-  }
-  Attr = StringRef(R->getValue("Attribute")->getNameInitAsString());
 }
 
-DXILParameter::DXILParameter(const Record *R) {
-  Name = R->getValueAsString("Name");
-  Pos = R->getValueAsInt("Pos");
-  Kind =
-      lookupParameterKind(R->getValue("ParamType")->getValue()->getAsString());
-  if (R->getValue("Doc"))
-    Doc = R->getValueAsString("Doc");
-  IsConst = R->getValueAsBit("IsConstant");
-  EnumName = R->getValueAsString("EnumName");
-  MaxValue = R->getValueAsInt("MaxValue");
-}
-
-static std::string parameterKindToString(ParameterKind Kind) {
+/// Return a string representation of ParameterKind enum
+/// \param Kind Parameter Kind enum value
+/// \return std::string string representation of input Kind
+static std::string getParameterKindStr(ParameterKind Kind) {
   switch (Kind) {
   case ParameterKind::INVALID:
     return "INVALID";
@@ -182,92 +190,77 @@ static std::string parameterKindToString(ParameterKind Kind) {
   llvm_unreachable("Unknown llvm::dxil::ParameterKind enum");
 }
 
-static void emitDXILOpEnum(DXILOperationDesc &Op, raw_ostream &OS) {
-  // Name = ID, // Doc
-  OS << Op.OpName << " = " << Op.OpCode << ", // " << Op.Doc << "\n";
-}
+/// Return a string representation of OverloadKind enum that maps to
+/// input Simple Value Type enum
+/// \param VT Simple Value Type enum
+/// \return std::string string representation of OverloadKind
 
-static std::string buildCategoryStr(StringSet<> &Cetegorys) {
-  std::string Str;
-  raw_string_ostream OS(Str);
-  for (auto &It : Cetegorys) {
-    OS << " " << It.getKey();
+static std::string getOverloadKindStr(MVT::SimpleValueType VT) {
+  switch (VT) {
+  case MVT::isVoid:
+    return "OverloadKind::VOID";
+  case MVT::f16:
+    return "OverloadKind::HALF";
+  case MVT::f32:
+    return "OverloadKind::FLOAT";
+  case MVT::f64:
+    return "OverloadKind::DOUBLE";
+  case MVT::i1:
+    return "OverloadKind::I1";
+  case MVT::i8:
+    return "OverloadKind::I8";
+  case MVT::i16:
+    return "OverloadKind::I16";
+  case MVT::i32:
+    return "OverloadKind::I32";
+  case MVT::i64:
+    return "OverloadKind::I64";
+  case MVT::iAny:
+    return "OverloadKind::I16 | OverloadKind::I32 | OverloadKind::I64";
+  case MVT::fAny:
+    return "OverloadKind::HALF | OverloadKind::FLOAT | OverloadKind::DOUBLE";
+  default:
+    llvm_unreachable(
+        "Support for specified parameter OverloadKind not yet implemented");
   }
-  return OS.str();
 }
 
-// Emit enum declaration for DXIL.
+/// Emit Enums of DXIL Ops
+/// \param A vector of DXIL Ops
+/// \param Output stream
 static void emitDXILEnums(std::vector<DXILOperationDesc> &Ops,
                           raw_ostream &OS) {
-  // Sort by Category + OpName.
+  // Sort by OpCode
   llvm::sort(Ops, [](DXILOperationDesc &A, DXILOperationDesc &B) {
-    // Group by Category first.
-    if (A.Category == B.Category)
-      // Inside same Category, order by OpName.
-      return A.OpName < B.OpName;
-    else
-      return A.Category < B.Category;
+    return A.OpCode < B.OpCode;
   });
 
   OS << "// Enumeration for operations specified by DXIL\n";
   OS << "enum class OpCode : unsigned {\n";
 
-  StringMap<StringSet<>> ClassMap;
-  StringRef PrevCategory = "";
   for (auto &Op : Ops) {
-    StringRef Category = Op.Category;
-    if (Category != PrevCategory) {
-      OS << "\n// " << Category << "\n";
-      PrevCategory = Category;
-    }
-    emitDXILOpEnum(Op, OS);
-    auto It = ClassMap.find(Op.OpClass);
-    if (It != ClassMap.end()) {
-      It->second.insert(Op.Category);
-    } else {
-      ClassMap[Op.OpClass].insert(Op.Category);
-    }
+    // Name = ID, // Doc
+    OS << Op.OpName << " = " << Op.OpCode << ", // " << Op.Doc << "\n";
   }
 
   OS << "\n};\n\n";
 
-  std::vector<std::pair<std::string, std::string>> ClassVec;
-  for (auto &It : ClassMap) {
-    ClassVec.emplace_back(
-        std::pair(It.getKey().str(), buildCategoryStr(It.second)));
-  }
-  // Sort by Category + ClassName.
-  llvm::sort(ClassVec, [](std::pair<std::string, std::string> &A,
-                          std::pair<std::string, std::string> &B) {
-    StringRef ClassA = A.first;
-    StringRef CategoryA = A.second;
-    StringRef ClassB = B.first;
-    StringRef CategoryB = B.second;
-    // Group by Category first.
-    if (CategoryA == CategoryB)
-      // Inside same Category, order by ClassName.
-      return ClassA < ClassB;
-    else
-      return CategoryA < CategoryB;
-  });
-
   OS << "// Groups for DXIL operations with equivalent function templates\n";
   OS << "enum class OpCodeClass : unsigned {\n";
-  PrevCategory = "";
-  for (auto &It : ClassVec) {
-
-    StringRef Category = It.second;
-    if (Category != PrevCategory) {
-      OS << "\n// " << Category << "\n";
-      PrevCategory = Category;
-    }
-    StringRef Name = It.first;
-    OS << Name << ",\n";
+  // Build an OpClass set to print
+  SmallSet<StringRef, 2> OpClassSet;
+  for (auto &Op : Ops) {
+    OpClassSet.insert(Op.OpClass);
+  }
+  for (auto &C : OpClassSet) {
+    OS << C << ",\n";
   }
   OS << "\n};\n\n";
 }
 
-// Emit map from llvm intrinsic to DXIL operation.
+/// Emit map of DXIL operation to LLVM or DirectX intrinsic
+/// \param A vector of DXIL Ops
+/// \param Output stream
 static void emitDXILIntrinsicMap(std::vector<DXILOperationDesc> &Ops,
                                  raw_ostream &OS) {
   OS << "\n";
@@ -285,75 +278,27 @@ static void emitDXILIntrinsicMap(std::vector<DXILOperationDesc> &Ops,
   OS << "\n";
 }
 
-/*!
- Convert operation attribute string to Attribute enum
-
- @param Attr string reference
- @return std::string Attribute enum string
- */
-static std::string emitDXILOperationAttr(StringRef Attr) {
-  return StringSwitch<std::string>(Attr)
-      .Case("ReadNone", "Attribute::ReadNone")
-      .Case("ReadOnly", "Attribute::ReadOnly")
-      .Default("Attribute::None");
-}
-
-static std::string overloadKindStr(ParameterKind Overload) {
-  switch (Overload) {
-  case ParameterKind::HALF:
-    return "OverloadKind::HALF";
-  case ParameterKind::FLOAT:
-    return "OverloadKind::FLOAT";
-  case ParameterKind::DOUBLE:
-    return "OverloadKind::DOUBLE";
-  case ParameterKind::I1:
-    return "OverloadKind::I1";
-  case ParameterKind::I8:
-    return "OverloadKind::I8";
-  case ParameterKind::I16:
-    return "OverloadKind::I16";
-  case ParameterKind::I32:
-    return "OverloadKind::I32";
-  case ParameterKind::I64:
-    return "OverloadKind::I64";
-  case ParameterKind::VOID:
-    return "OverloadKind::VOID";
-  default:
-    return "OverloadKind::UNKNOWN";
-  }
-}
-
-static std::string
-getDXILOperationOverloads(SmallVector<ParameterKind> Overloads) {
-  // Format is: OverloadKind::FLOAT | OverloadKind::HALF
-  auto It = Overloads.begin();
-  std::string Result;
-  raw_string_ostream OS(Result);
-  OS << overloadKindStr(*It);
-  for (++It; It != Overloads.end(); ++It) {
-    OS << " | " << overloadKindStr(*It);
+/// Convert operation attribute string to Attribute enum
+///
+/// \param Attr string reference
+/// \return std::string Attribute enum string
+
+static std::string emitDXILOperationAttr(SmallVector<std::string> Attrs) {
+  for (auto Attr : Attrs) {
+    // TODO: For now just recognize IntrNoMem and IntrReadMem as valid and
+    //  ignore others.
+    if (Attr == "IntrNoMem") {
+      return "Attribute::ReadNone";
+    } else if (Attr == "IntrReadMem") {
+      return "Attribute::ReadOnly";
+    }
   }
-  return OS.str();
-}
-
-static std::string lowerFirstLetter(StringRef Name) {
-  if (Name.empty())
-    return "";
-
-  std::string LowerName = Name.str();
-  LowerName[0] = llvm::toLower(Name[0]);
-  return LowerName;
-}
-
-static std::string getDXILOpClassName(StringRef OpClass) {
-  // Lower first letter expect for special case.
-  return StringSwitch<std::string>(OpClass)
-      .Case("CBufferLoad", "cbufferLoad")
-      .Case("CBufferLoadLegacy", "cbufferLoadLegacy")
-      .Case("GSInstanceID", "gsInstanceID")
-      .Default(lowerFirstLetter(OpClass));
+  return "Attribute::None";
 }
 
+/// Emit DXIL operation table
+/// \param A vector of DXIL Ops
+/// \param Output stream
 static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
                                    raw_ostream &OS) {
   // Sort by OpCode.
@@ -369,15 +314,16 @@ static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
   StringMap<SmallVector<ParameterKind>> ParameterMap;
   StringSet<> ClassSet;
   for (auto &Op : Ops) {
-    OpStrings.add(Op.OpName.str());
+    OpStrings.add(Op.OpName);
 
     if (ClassSet.contains(Op.OpClass))
       continue;
     ClassSet.insert(Op.OpClass);
-    OpClassStrings.add(getDXILOpClassName(Op.OpClass));
+    OpClassStrings.add(Op.OpClass.data());
     SmallVector<ParameterKind> ParamKindVec;
-    for (auto &Param : Op.Params) {
-      ParamKindVec.emplace_back(Param.Kind);
+    // ParamKindVec is a vector of parameters. Skip return type at index 0
+    for (unsigned i = 1; i < Op.OpTypes.size(); i++) {
+      ParamKindVec.emplace_back(getParameterKind(Op.OpTypes[i]));
     }
     ParameterMap[Op.OpClass] = ParamKindVec;
     Parameters.add(ParamKindVec);
@@ -389,7 +335,7 @@ static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
   Parameters.layout();
 
   // Emit the DXIL operation table.
-  //{dxil::OpCode::Sin, OpCodeNameIndex, OpCodeClass::Unary,
+  //{dxil::OpCode::Sin, OpCodeNameIndex, OpCodeClass::unary,
   // OpCodeClassNameIndex,
   // OverloadKind::FLOAT | OverloadKind::HALF, Attribute::AttrKind::ReadNone, 0,
   // 3, ParameterTableOffset},
@@ -398,12 +344,12 @@ static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
 
   OS << "  static const OpCodeProperty OpCodeProps[] = {\n";
   for (auto &Op : Ops) {
-    OS << "  { dxil::OpCode::" << Op.OpName << ", "
-       << OpStrings.get(Op.OpName.str()) << ", OpCodeClass::" << Op.OpClass
-       << ", " << OpClassStrings.get(getDXILOpClassName(Op.OpClass)) << ", "
-       << getDXILOperationOverloads(Op.OverloadTypes) << ", "
-       << emitDXILOperationAttr(Op.Attr) << ", " << Op.OverloadParamIndex
-       << ", " << Op.Params.size() << ", "
+    OS << "  { dxil::OpCode::" << Op.OpName << ", " << OpStrings.get(Op.OpName)
+       << ", OpCodeClass::" << Op.OpClass << ", "
+       << OpClassStrings.get(Op.OpClass.data()) << ", "
+       << getOverloadKindStr(Op.OpTypes[0]) << ", "
+       << emitDXILOperationAttr(Op.OpAttributes) << ", "
+       << Op.OverloadParamIndex << ", " << Op.OpTypes.size() - 1 << ", "
        << Parameters.get(ParameterMap[Op.OpClass]) << " },\n";
   }
   OS << "  };\n";
@@ -418,7 +364,7 @@ static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
         "OpCodeProperty &B) {\n";
   OS << "                          return A.OpCode < B.OpCode;\n";
   OS << "                        });\n";
-  OS << "  assert(Prop && \"fail to find OpCodeProperty\");\n";
+  OS << "  assert(Prop && \"failed to find OpCodeProperty\");\n";
   OS << "  return Prop;\n";
   OS << "}\n\n";
 
@@ -450,7 +396,7 @@ static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
   Parameters.emit(
       OS,
       [](raw_ostream &ParamOS, ParameterKind Kind) {
-        ParamOS << "ParameterKind::" << parameterKindToString(Kind);
+        ParamOS << "ParameterKind::" << getParameterKindStr(Kind);
       },
       "ParameterKind::INVALID");
   OS << "  };\n\n";
@@ -459,30 +405,28 @@ static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
   OS << "}\n ";
 }
 
+/// Entry function call that invokes the functionality of this TableGen backend
+/// \param Records TableGen records of DXIL Operations defined in DXIL.td
+/// \param OS output stream
 static void EmitDXILOperation(RecordKeeper &Records, raw_ostream &OS) {
-  std::vector<Record *> Ops = Records.getAllDerivedDefinitions("DXILOperation");
   OS << "// Generated code, do not edit.\n";
   OS << "\n";
-
+  // Get all DXIL Ops to intrinsic mapping records
+  std::vector<Record *> OpIntrMaps =
+      Records.getAllDerivedDefinitions("DXILOpMapping");
   std::vector<DXILOperationDesc> DXILOps;
-  DXILOps.reserve(Ops.size());
-  for (auto *Record : Ops) {
+  for (auto *Record : OpIntrMaps) {
     DXILOps.emplace_back(DXILOperationDesc(Record));
   }
-
   OS << "#ifdef DXIL_OP_ENUM\n";
   emitDXILEnums(DXILOps, OS);
   OS << "#endif\n\n";
-
   OS << "#ifdef DXIL_OP_INTRINSIC_MAP\n";
   emitDXILIntrinsicMap(DXILOps, OS);
   OS << "#endif\n\n";
-
   OS << "#ifdef DXIL_OP_OPERATION_TABLE\n";
   emitDXILOperationTable(DXILOps, OS);
   OS << "#endif\n\n";
-
-  OS << "\n";
 }
 
 static TableGen::Emitter::Opt X("gen-dxil-operation", EmitDXILOperation,

From ec95379df363253ffcbbda21297417e703d5ccca Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Thu, 29 Feb 2024 14:19:23 +0000
Subject: [PATCH 18/55] [lldb][test] Clear pexpect found var before checking
 again

If you run cmake without pexpect installed it errors as expected.
However, if you just `pip install pexpect` and cmake again it still
doesn't find it because it cached the result of the search.

Unset the result before looking for pexpect. So that this works
as expected:
cmake ...
pip3 install pexpect
cmake ...
---
 lldb/test/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt
index d8cbb24b6c9b81..7c31fd487f6ff8 100644
--- a/lldb/test/CMakeLists.txt
+++ b/lldb/test/CMakeLists.txt
@@ -30,6 +30,7 @@ endif()
 # LLDB tree. However, we delay the deletion of it from the tree in case
 # users/buildbots don't have the package yet and need some time to install it.
 if (NOT LLDB_TEST_USE_VENDOR_PACKAGES)
+  unset(PY_pexpect_FOUND CACHE)
   lldb_find_python_module(pexpect)
   if (NOT PY_pexpect_FOUND)
     message(FATAL_ERROR

From aadd7650447b301f8d08fe94a886df05971adecb Mon Sep 17 00:00:00 2001
From: Stephen Tozer <stephen.tozer@sony.com>
Date: Thu, 29 Feb 2024 14:37:06 +0000
Subject: [PATCH 19/55] [DebugInfo][RemoveDIs] Prevent duplicate DPValues from
 being returned by findDbgIntrinsics (#82764)

Fixes the error described here:
https://github.com/llvm/llvm-project/commit/a93a4ec7dd205b965ee5597314bb376520cd736c#commitcomment-138965199

The function `findDbgIntrinsics` is used to return a list of debug
intrinsics and DPValues that use a given value, with the intent that no
duplicates are returned in either list. For DPValues, we've guarded
against DPValues that use a value multiple times as part of a DIArgList,
but we have not guarded against DPValues that use a value multiple times
as separate operands (currently only possible for `dbg_assign`s,
something I missed in my implementation of that type!). This patch adds
a guard, and also updates a test to cover this case.
---
 llvm/lib/IR/DebugInfo.cpp                                | 7 ++++---
 llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll | 7 +++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index e044ab3230c5f9..1f3ff2246a4453 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -99,8 +99,8 @@ static void findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
   SmallPtrSet<DPValue *, 4> EncounteredDPValues;
 
   /// Append IntrinsicT users of MetadataAsValue(MD).
-  auto AppendUsers = [&Ctx, &EncounteredIntrinsics, &Result,
-                      DPValues](Metadata *MD) {
+  auto AppendUsers = [&Ctx, &EncounteredIntrinsics, &EncounteredDPValues,
+                      &Result, DPValues](Metadata *MD) {
     if (auto *MDV = MetadataAsValue::getIfExists(Ctx, MD)) {
       for (User *U : MDV->users())
         if (IntrinsicT *DVI = dyn_cast<IntrinsicT>(U))
@@ -113,7 +113,8 @@ static void findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
     if (LocalAsMetadata *L = dyn_cast<LocalAsMetadata>(MD)) {
       for (DPValue *DPV : L->getAllDPValueUsers()) {
         if (Type == DPValue::LocationType::Any || DPV->getType() == Type)
-          DPValues->push_back(DPV);
+          if (EncounteredDPValues.insert(DPV).second)
+            DPValues->push_back(DPV);
       }
     }
   };
diff --git a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll
index 47b2ddafcfc650..9417859480e58a 100644
--- a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll
+++ b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll
@@ -9,6 +9,11 @@
 ; CHECK-SAME:    !DIExpression(DW_OP_plus_uconst, [[OffsetX:[0-9]*]]))
 ;                                                                   ^ No deref at the end, as this variable ("x") is an array;
 ;                                                                     its value is its address. The entire array is in the frame.
+; CHECK:       call void @llvm.dbg.assign(metadata ptr %[[frame]]
+; CHECK-SAME:    !DIExpression(DW_OP_plus_uconst, [[OffsetX]])
+;; FIXME: Should we be updating the addresses on assigns here as well?
+; CHECK-SAME:    , metadata ptr %[[frame]], !DIExpression())
+
 ; CHECK:       call void @llvm.dbg.value(metadata ptr %[[frame]]
 ; CHECK-SAME:    !DIExpression(DW_OP_plus_uconst, [[OffsetSpill:[0-9]*]], DW_OP_deref))
 ; CHECK:       call void @llvm.dbg.value(metadata ptr %[[frame]]
@@ -78,6 +83,7 @@ init.ready:                                       ; preds = %init.suspend, %coro
   %i.init.ready.inc = add nsw i32 0, 1
   call void @llvm.dbg.value(metadata i32 %i.init.ready.inc, metadata !6, metadata !DIExpression()), !dbg !11
   call void @llvm.dbg.value(metadata ptr %x, metadata !12, metadata !DIExpression()), !dbg !17
+  call void @llvm.dbg.assign(metadata ptr %x, metadata !12, metadata !DIExpression(), metadata !30, metadata ptr %x, metadata !DIExpression()), !dbg !17
   call void @llvm.memset.p0.i64(ptr align 16 %x, i8 0, i64 40, i1 false), !dbg !17
   call void @print(i32 %i.init.ready.inc)
   %ready.again = call zeroext i1 @await_ready()
@@ -250,3 +256,4 @@ attributes #4 = { argmemonly nofree nosync nounwind willreturn writeonly }
 !21 = !DILocation(line: 43, column: 3, scope: !7)
 !22 = !DILocation(line: 43, column: 8, scope: !7)
 !23 = !DILocalVariable(name: "produced", scope: !7, file: !1, line:24, type: !10)
+!30 = distinct !DIAssignID()
\ No newline at end of file

From 0d572c41f941a4c9b3744a3b849ec35cd26bae2b Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic@amd.com>
Date: Thu, 29 Feb 2024 15:38:54 +0100
Subject: [PATCH 20/55] AMDGPU\GlobalISel: remove
 amdgpu-global-isel-risky-select flag (#83426)

AMDGPUInstructionSelector should no longer attempt to select S1 G_PHIs.
Remove MIR test that attempts to inst-select divergent vcc(S1) G_PHI.
Lane mask merging algorithm for GlobalISel is now responsible for
selecting divergent S1 G_PHIs in AMDGPUGlobalISelDivergenceLowering.
Uniform S1 G_PHIs should be lowered to S32 G_PHIs in reg bank select
pass. In summary S1 G_PHIs should not reach AMDGPUInstructionSelector.
---
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 20 ++-----
 .../divergence-temporal-divergent-i1.ll       |  2 +-
 .../divergence-temporal-divergent-reg.ll      |  2 +-
 .../AMDGPU/GlobalISel/inst-select-phi.mir     | 56 +------------------
 4 files changed, 9 insertions(+), 71 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index aacc3590a5dbf9..b2c65e61b0097c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -34,12 +34,6 @@
 using namespace llvm;
 using namespace MIPatternMatch;
 
-static cl::opt<bool> AllowRiskySelect(
-  "amdgpu-global-isel-risky-select",
-  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
-  cl::init(false),
-  cl::ReallyHidden);
-
 #define GET_GLOBALISEL_IMPL
 #define AMDGPUSubtarget GCNSubtarget
 #include "AMDGPUGenGlobalISel.inc"
@@ -211,14 +205,12 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
   const Register DefReg = I.getOperand(0).getReg();
   const LLT DefTy = MRI->getType(DefReg);
 
-  if (DefTy == LLT::scalar(1)) {
-    if (!AllowRiskySelect) {
-      LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
-      return false;
-    }
-
-    LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
-  }
+  // S1 G_PHIs should not be selected in instruction-select, instead:
+  // - divergent S1 G_PHI should go through lane mask merging algorithm
+  //   and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
+  // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
+  if (DefTy == LLT::scalar(1))
+    return false;
 
   // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
index 312c6a3822ce4f..1855ede0483def 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 
 define void @temporal_divergent_i1_phi(float %val, ptr %addr) {
 ; GFX10-LABEL: temporal_divergent_i1_phi:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
index b21e6a729dbc22..1934958ea8f37c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 
 define void @temporal_divergent_i32(float %val, ptr %addr) {
 ; GFX10-LABEL: temporal_divergent_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir
index c7d45f062d0d20..4bb9eb807e1568 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -amdgpu-global-isel-risky-select -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GCN
+# RUN: llc -mtriple=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GCN
 
 ---
 name:            g_phi_s32_ss_sbranch
@@ -321,60 +321,6 @@ body:             |
 
 ...
 
----
-name:            g_phi_vcc_s1_sbranch
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-machineFunctionInfo: {}
-body:             |
-  ; GCN-LABEL: name: g_phi_vcc_s1_sbranch
-  ; GCN: bb.0:
-  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; GCN-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr2
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GCN-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY]], [[S_MOV_B32_]], implicit $exec
-  ; GCN-NEXT:   S_CMP_EQ_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $scc
-  ; GCN-NEXT:   $scc = COPY [[COPY3]]
-  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
-  ; GCN-NEXT:   S_BRANCH %bb.2
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.1:
-  ; GCN-NEXT:   successors: %bb.2(0x80000000)
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY1]], [[S_MOV_B32_]], implicit $exec
-  ; GCN-NEXT:   S_BRANCH %bb.2
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.2:
-  ; GCN-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[V_CMP_EQ_U32_e64_]], %bb.0, [[V_CMP_EQ_U32_e64_1]], %bb.1
-  ; GCN-NEXT:   S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[PHI]]
-  bb.0:
-    liveins: $vgpr0, $vgpr1, $sgpr2
-
-    %0:vgpr(s32) = COPY $vgpr0
-    %1:vgpr(s32) = COPY $vgpr1
-    %2:sgpr(s32) = COPY $sgpr2
-    %3:sgpr(s32) = G_CONSTANT i32 0
-    %4:vcc(s1) = G_ICMP intpred(eq), %0, %3
-    %5:sgpr(s32) = G_ICMP intpred(eq), %2(s32), %3
-    G_BRCOND %5, %bb.1
-    G_BR %bb.2
-
-  bb.1:
-    %6:vcc(s1) = G_ICMP intpred(eq), %1, %3
-    G_BR %bb.2
-
-  bb.2:
-    %7:vcc(s1) = G_PHI %4, %bb.0, %6, %bb.1
-    S_SETPC_B64 undef $sgpr30_sgpr31, implicit %7
-
-...
-
 ---
 name:            phi_s32_ss_sbranch
 legalized:       true

From 85dc3dfb1fa2c6720bdfbaaab012ebd96cbe3a58 Mon Sep 17 00:00:00 2001
From: Stephen Tozer <stephen.tozer@sony.com>
Date: Thu, 29 Feb 2024 14:46:52 +0000
Subject: [PATCH 21/55] [DebugInfo][RemoveDIs] Fix incorrect test expect

Fixes: aadd7650447b
The above commit landed with an incorrect test expect, missing a `metadata`
prefix. This patch adds the expected prefix to the test.
---
 llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll
index 9417859480e58a..dd9310fe34f341 100644
--- a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll
+++ b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll
@@ -12,7 +12,7 @@
 ; CHECK:       call void @llvm.dbg.assign(metadata ptr %[[frame]]
 ; CHECK-SAME:    !DIExpression(DW_OP_plus_uconst, [[OffsetX]])
 ;; FIXME: Should we be updating the addresses on assigns here as well?
-; CHECK-SAME:    , metadata ptr %[[frame]], !DIExpression())
+; CHECK-SAME:    , metadata ptr %[[frame]], metadata !DIExpression())
 
 ; CHECK:       call void @llvm.dbg.value(metadata ptr %[[frame]]
 ; CHECK-SAME:    !DIExpression(DW_OP_plus_uconst, [[OffsetSpill:[0-9]*]], DW_OP_deref))

From a872a35251b833aaddec2172710ff234236afbd8 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Thu, 29 Feb 2024 14:31:58 +0000
Subject: [PATCH 22/55] [NFC][RemoveDIs] Add bodies for inst-constructors
 taking iterators

In a previous commit I added declarations for all these functions, but
forgot to add bodies for them (as nothing uses them yet). These
iterator-taking constructors are necessary for the future where we only
use iterators for insertion, preserving some debug-info properties.

Also adds two extra declarations I missed in 76dd4bc036f
---
 llvm/include/llvm/IR/InstrTypes.h |   9 +++
 llvm/lib/IR/Instructions.cpp      | 129 ++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+)

diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index a124fbb88e3bcd..0e81d3b391a083 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -1536,10 +1536,19 @@ class CallBase : public Instruction {
                                     OperandBundleDef OB,
                                     Instruction *InsertPt = nullptr);
 
+  /// Create a clone of \p CB with operand bundle \p OB added.
+  static CallBase *addOperandBundle(CallBase *CB, uint32_t ID,
+                                    OperandBundleDef OB,
+                                    BasicBlock::iterator InsertPt);
+
   /// Create a clone of \p CB with operand bundle \p ID removed.
   static CallBase *removeOperandBundle(CallBase *CB, uint32_t ID,
                                        Instruction *InsertPt = nullptr);
 
+  /// Create a clone of \p CB with operand bundle \p ID removed.
+  static CallBase *removeOperandBundle(CallBase *CB, uint32_t ID,
+                                       BasicBlock::iterator InsertPt);
+
   static bool classof(const Instruction *I) {
     return I->getOpcode() == Instruction::Call ||
            I->getOpcode() == Instruction::Invoke ||
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 976015656a5f96..42cdcad78228f6 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -303,6 +303,20 @@ void LandingPadInst::addClause(Constant *Val) {
 //                        CallBase Implementation
 //===----------------------------------------------------------------------===//
 
+CallBase *CallBase::Create(CallBase *CB, ArrayRef<OperandBundleDef> Bundles,
+                           BasicBlock::iterator InsertPt) {
+  switch (CB->getOpcode()) {
+  case Instruction::Call:
+    return CallInst::Create(cast<CallInst>(CB), Bundles, InsertPt);
+  case Instruction::Invoke:
+    return InvokeInst::Create(cast<InvokeInst>(CB), Bundles, InsertPt);
+  case Instruction::CallBr:
+    return CallBrInst::Create(cast<CallBrInst>(CB), Bundles, InsertPt);
+  default:
+    llvm_unreachable("Unknown CallBase sub-class!");
+  }
+}
+
 CallBase *CallBase::Create(CallBase *CB, ArrayRef<OperandBundleDef> Bundles,
                            Instruction *InsertPt) {
   switch (CB->getOpcode()) {
@@ -557,6 +571,18 @@ CallBase::BundleOpInfo &CallBase::getBundleOpInfoForOperand(unsigned OpIdx) {
   return *Current;
 }
 
+CallBase *CallBase::addOperandBundle(CallBase *CB, uint32_t ID,
+                                     OperandBundleDef OB,
+                                     BasicBlock::iterator InsertPt) {
+  if (CB->getOperandBundle(ID))
+    return CB;
+
+  SmallVector<OperandBundleDef, 1> Bundles;
+  CB->getOperandBundlesAsDefs(Bundles);
+  Bundles.push_back(OB);
+  return Create(CB, Bundles, InsertPt);
+}
+
 CallBase *CallBase::addOperandBundle(CallBase *CB, uint32_t ID,
                                      OperandBundleDef OB,
                                      Instruction *InsertPt) {
@@ -569,6 +595,23 @@ CallBase *CallBase::addOperandBundle(CallBase *CB, uint32_t ID,
   return Create(CB, Bundles, InsertPt);
 }
 
+CallBase *CallBase::removeOperandBundle(CallBase *CB, uint32_t ID,
+                                        BasicBlock::iterator InsertPt) {
+  SmallVector<OperandBundleDef, 1> Bundles;
+  bool CreateNew = false;
+
+  for (unsigned I = 0, E = CB->getNumOperandBundles(); I != E; ++I) {
+    auto Bundle = CB->getOperandBundleAt(I);
+    if (Bundle.getTagID() == ID) {
+      CreateNew = true;
+      continue;
+    }
+    Bundles.emplace_back(Bundle);
+  }
+
+  return CreateNew ? Create(CB, Bundles, InsertPt) : CB;
+}
+
 CallBase *CallBase::removeOperandBundle(CallBase *CB, uint32_t ID,
                                         Instruction *InsertPt) {
   SmallVector<OperandBundleDef, 1> Bundles;
@@ -716,6 +759,13 @@ void CallInst::init(FunctionType *FTy, Value *Func, const Twine &NameStr) {
   setName(NameStr);
 }
 
+CallInst::CallInst(FunctionType *Ty, Value *Func, const Twine &Name,
+                   BasicBlock::iterator InsertBefore)
+    : CallBase(Ty->getReturnType(), Instruction::Call,
+               OperandTraits<CallBase>::op_end(this) - 1, 1, InsertBefore) {
+  init(Ty, Func, Name);
+}
+
 CallInst::CallInst(FunctionType *Ty, Value *Func, const Twine &Name,
                    Instruction *InsertBefore)
     : CallBase(Ty->getReturnType(), Instruction::Call,
@@ -880,6 +930,20 @@ InvokeInst::InvokeInst(const InvokeInst &II)
   SubclassOptionalData = II.SubclassOptionalData;
 }
 
+InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef<OperandBundleDef> OpB,
+                               BasicBlock::iterator InsertPt) {
+  std::vector<Value *> Args(II->arg_begin(), II->arg_end());
+
+  auto *NewII = InvokeInst::Create(
+      II->getFunctionType(), II->getCalledOperand(), II->getNormalDest(),
+      II->getUnwindDest(), Args, OpB, II->getName(), InsertPt);
+  NewII->setCallingConv(II->getCallingConv());
+  NewII->SubclassOptionalData = II->SubclassOptionalData;
+  NewII->setAttributes(II->getAttributes());
+  NewII->setDebugLoc(II->getDebugLoc());
+  return NewII;
+}
+
 InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef<OperandBundleDef> OpB,
                                Instruction *InsertPt) {
   std::vector<Value *> Args(II->arg_begin(), II->arg_end());
@@ -953,6 +1017,21 @@ CallBrInst::CallBrInst(const CallBrInst &CBI)
   NumIndirectDests = CBI.NumIndirectDests;
 }
 
+CallBrInst *CallBrInst::Create(CallBrInst *CBI, ArrayRef<OperandBundleDef> OpB,
+                               BasicBlock::iterator InsertPt) {
+  std::vector<Value *> Args(CBI->arg_begin(), CBI->arg_end());
+
+  auto *NewCBI = CallBrInst::Create(
+      CBI->getFunctionType(), CBI->getCalledOperand(), CBI->getDefaultDest(),
+      CBI->getIndirectDests(), Args, OpB, CBI->getName(), InsertPt);
+  NewCBI->setCallingConv(CBI->getCallingConv());
+  NewCBI->SubclassOptionalData = CBI->SubclassOptionalData;
+  NewCBI->setAttributes(CBI->getAttributes());
+  NewCBI->setDebugLoc(CBI->getDebugLoc());
+  NewCBI->NumIndirectDests = CBI->NumIndirectDests;
+  return NewCBI;
+}
+
 CallBrInst *CallBrInst::Create(CallBrInst *CBI, ArrayRef<OperandBundleDef> OpB,
                                Instruction *InsertPt) {
   std::vector<Value *> Args(CBI->arg_begin(), CBI->arg_end());
@@ -1135,6 +1214,18 @@ CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB,
 //                       CatchSwitchInst Implementation
 //===----------------------------------------------------------------------===//
 
+CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
+                                 unsigned NumReservedValues,
+                                 const Twine &NameStr,
+                                 BasicBlock::iterator InsertBefore)
+    : Instruction(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0,
+                  InsertBefore) {
+  if (UnwindDest)
+    ++NumReservedValues;
+  init(ParentPad, UnwindDest, NumReservedValues + 1);
+  setName(NameStr);
+}
+
 CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
                                  unsigned NumReservedValues,
                                  const Twine &NameStr,
@@ -3222,6 +3313,14 @@ void BinaryOperator::AssertOK() {
 #endif
 }
 
+BinaryOperator *BinaryOperator::Create(BinaryOps Op, Value *S1, Value *S2,
+                                       const Twine &Name,
+                                       BasicBlock::iterator InsertBefore) {
+  assert(S1->getType() == S2->getType() &&
+         "Cannot create binary operator with two operands of differing type!");
+  return new BinaryOperator(Op, S1, S2, S1->getType(), Name, InsertBefore);
+}
+
 BinaryOperator *BinaryOperator::Create(BinaryOps Op, Value *S1, Value *S2,
                                        const Twine &Name,
                                        Instruction *InsertBefore) {
@@ -3277,6 +3376,13 @@ BinaryOperator *BinaryOperator::CreateNUWNeg(Value *Op, const Twine &Name,
   return BinaryOperator::CreateNUWSub(Zero, Op, Name, InsertAtEnd);
 }
 
+BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name,
+                                          BasicBlock::iterator InsertBefore) {
+  Constant *C = Constant::getAllOnesValue(Op->getType());
+  return new BinaryOperator(Instruction::Xor, Op, C,
+                            Op->getType(), Name, InsertBefore);
+}
+
 BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name,
                                           Instruction *InsertBefore) {
   Constant *C = Constant::getAllOnesValue(Op->getType());
@@ -3821,6 +3927,17 @@ CastInst *CastInst::CreatePointerBitCastOrAddrSpaceCast(
   return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
 }
 
+CastInst *CastInst::CreateBitOrPointerCast(Value *S, Type *Ty,
+                                           const Twine &Name,
+                                           BasicBlock::iterator InsertBefore) {
+  if (S->getType()->isPointerTy() && Ty->isIntegerTy())
+    return Create(Instruction::PtrToInt, S, Ty, Name, InsertBefore);
+  if (S->getType()->isIntegerTy() && Ty->isPointerTy())
+    return Create(Instruction::IntToPtr, S, Ty, Name, InsertBefore);
+
+  return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
+}
+
 CastInst *CastInst::CreateBitOrPointerCast(Value *S, Type *Ty,
                                            const Twine &Name,
                                            Instruction *InsertBefore) {
@@ -4455,6 +4572,18 @@ CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS,
   setName(Name);
 }
 
+CmpInst *
+CmpInst::Create(OtherOps Op, Predicate predicate, Value *S1, Value *S2,
+                const Twine &Name, BasicBlock::iterator InsertBefore) {
+  if (Op == Instruction::ICmp) {
+    return new ICmpInst(InsertBefore, CmpInst::Predicate(predicate),
+                        S1, S2, Name);
+  }
+
+  return new FCmpInst(InsertBefore, CmpInst::Predicate(predicate),
+                      S1, S2, Name);
+}
+
 CmpInst *
 CmpInst::Create(OtherOps Op, Predicate predicate, Value *S1, Value *S2,
                 const Twine &Name, Instruction *InsertBefore) {

From 1e6627ecef42fa8e36dae71589fc17d3adbd18aa Mon Sep 17 00:00:00 2001
From: RicoAfoat <51285519+RicoAfoat@users.noreply.github.com>
Date: Thu, 29 Feb 2024 22:55:51 +0800
Subject: [PATCH 23/55] [X86] matchAddressRecursively - ensure dead nodes are
 replaced before matching the index register (#82881)

Fixes #82431 - see #82431 for more information.
---
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp   | 10 +++++----
 llvm/test/CodeGen/X86/inline-asm-memop.ll | 27 +++++++++++++++++++++++
 2 files changed, 33 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/inline-asm-memop.ll

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index c8f80ced354538..5cbd9ab4dc2d6c 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2732,13 +2732,15 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
       insertDAGNode(*CurDAG, N, Zext);
       SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
       insertDAGNode(*CurDAG, N, NewShl);
+      CurDAG->ReplaceAllUsesWith(N, NewShl);
+      CurDAG->RemoveDeadNode(N.getNode());
 
       // Convert the shift to scale factor.
       AM.Scale = 1 << ShAmtV;
-      AM.IndexReg = Zext;
-
-      CurDAG->ReplaceAllUsesWith(N, NewShl);
-      CurDAG->RemoveDeadNode(N.getNode());
+      // If matchIndexRecursively is not called here,
+      // Zext may be replaced by other nodes but later used to call a builder
+      // method
+      AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
       return false;
     }
 
diff --git a/llvm/test/CodeGen/X86/inline-asm-memop.ll b/llvm/test/CodeGen/X86/inline-asm-memop.ll
new file mode 100644
index 00000000000000..83442498076102
--- /dev/null
+++ b/llvm/test/CodeGen/X86/inline-asm-memop.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O0 < %s | FileCheck %s
+
+; A bug in X86DAGToDAGISel::matchAddressRecursively create a zext SDValue which
+; is quickly replaced by other SDValue but already pushed into vector for later
+; calling for SelectionDAGISel::Select_INLINEASM getNode builder, see issue
+; 82431 for more infomation.
+
+define void @PR82431(i8 %call, ptr %b) {
+; CHECK-LABEL: PR82431:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movb %dil, %al
+; CHECK-NEXT:    addb $1, %al
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    # kill: def $rax killed $eax
+; CHECK-NEXT:    shlq $3, %rax
+; CHECK-NEXT:    addq %rax, %rsi
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    retq
+entry:
+  %narrow = add nuw i8 %call, 1
+  %idxprom = zext i8 %narrow to i64
+  %arrayidx = getelementptr [1 x i64], ptr %b, i64 0, i64 %idxprom
+  tail call void asm "", "=*m,*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i64) %arrayidx, ptr elementtype(i64) %arrayidx)
+  ret void
+}

From 4f132dca711f4b425f9d370f5d59efb766b8bffa Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland@gmail.com>
Date: Thu, 29 Feb 2024 09:57:15 -0500
Subject: [PATCH 24/55] [RISCV] Enable PostRAScheduler for SiFive7 (#83166)

Based on numbers collected in our downstream toolchain.
---
 llvm/lib/Target/RISCV/RISCVSchedSiFive7.td    |  1 +
 llvm/test/CodeGen/RISCV/machine-combiner.ll   |  8 +--
 .../CodeGen/RISCV/short-forward-branch-opt.ll | 64 +++++++++----------
 3 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 040cec42674000..0430d603620b6a 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -198,6 +198,7 @@ def SiFive7Model : SchedMachineModel {
   let LoadLatency = 3;
   let MispredictPenalty = 3;
   let CompleteModel = 0;
+  let PostRAScheduler = true;
   let EnableIntervals = true;
   let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
                              HasStdExtZcmt, HasStdExtZknd, HasStdExtZkne,
diff --git a/llvm/test/CodeGen/RISCV/machine-combiner.ll b/llvm/test/CodeGen/RISCV/machine-combiner.ll
index 7c1792e2f101f5..cfdefec04600c8 100644
--- a/llvm/test/CodeGen/RISCV/machine-combiner.ll
+++ b/llvm/test/CodeGen/RISCV/machine-combiner.ll
@@ -1096,10 +1096,10 @@ declare double @llvm.maxnum.f64(double, double)
 define double @test_fmadd_strategy(double %a0, double %a1, double %a2, double %a3, i64 %flag) {
 ; CHECK_LOCAL-LABEL: test_fmadd_strategy:
 ; CHECK_LOCAL:       # %bb.0: # %entry
-; CHECK_LOCAL-NEXT:    fmv.d fa5, fa0
 ; CHECK_LOCAL-NEXT:    fsub.d fa4, fa0, fa1
-; CHECK_LOCAL-NEXT:    fmul.d fa0, fa4, fa2
 ; CHECK_LOCAL-NEXT:    andi a0, a0, 1
+; CHECK_LOCAL-NEXT:    fmv.d fa5, fa0
+; CHECK_LOCAL-NEXT:    fmul.d fa0, fa4, fa2
 ; CHECK_LOCAL-NEXT:    beqz a0, .LBB76_2
 ; CHECK_LOCAL-NEXT:  # %bb.1: # %entry
 ; CHECK_LOCAL-NEXT:    fmul.d fa4, fa5, fa1
@@ -1110,10 +1110,10 @@ define double @test_fmadd_strategy(double %a0, double %a1, double %a2, double %a
 ;
 ; CHECK_GLOBAL-LABEL: test_fmadd_strategy:
 ; CHECK_GLOBAL:       # %bb.0: # %entry
-; CHECK_GLOBAL-NEXT:    fmv.d fa5, fa0
 ; CHECK_GLOBAL-NEXT:    fsub.d fa4, fa0, fa1
-; CHECK_GLOBAL-NEXT:    fmul.d fa0, fa4, fa2
 ; CHECK_GLOBAL-NEXT:    andi a0, a0, 1
+; CHECK_GLOBAL-NEXT:    fmv.d fa5, fa0
+; CHECK_GLOBAL-NEXT:    fmul.d fa0, fa4, fa2
 ; CHECK_GLOBAL-NEXT:    beqz a0, .LBB76_2
 ; CHECK_GLOBAL-NEXT:  # %bb.1: # %entry
 ; CHECK_GLOBAL-NEXT:    fmul.d fa5, fa5, fa1
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
index 87406f22d169d1..c0c11fefafb555 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
@@ -813,24 +813,24 @@ define i64 @select_sll(i64 %A, i64 %B, i64 %C, i1 zeroext %cond) {
 ; RV32SFB-NEXT:    not a7, a2
 ; RV32SFB-NEXT:    srli a0, a0, 1
 ; RV32SFB-NEXT:    sll t0, a1, a2
-; RV32SFB-NEXT:    srl a0, a0, a7
 ; RV32SFB-NEXT:    addi a2, a2, -32
+; RV32SFB-NEXT:    srl a0, a0, a7
 ; RV32SFB-NEXT:    mv a1, a3
-; RV32SFB-NEXT:    bgez a2, .LBB20_2
+; RV32SFB-NEXT:    bltz a2, .LBB20_2
 ; RV32SFB-NEXT:  # %bb.1: # %entry
-; RV32SFB-NEXT:    or a1, t0, a0
+; RV32SFB-NEXT:    li a3, 0
 ; RV32SFB-NEXT:  .LBB20_2: # %entry
-; RV32SFB-NEXT:    bltz a2, .LBB20_4
+; RV32SFB-NEXT:    bgez a2, .LBB20_4
 ; RV32SFB-NEXT:  # %bb.3: # %entry
-; RV32SFB-NEXT:    li a3, 0
+; RV32SFB-NEXT:    or a1, t0, a0
 ; RV32SFB-NEXT:  .LBB20_4: # %entry
 ; RV32SFB-NEXT:    beqz a6, .LBB20_6
 ; RV32SFB-NEXT:  # %bb.5: # %entry
-; RV32SFB-NEXT:    mv a1, a5
+; RV32SFB-NEXT:    mv a3, a4
 ; RV32SFB-NEXT:  .LBB20_6: # %entry
 ; RV32SFB-NEXT:    beqz a6, .LBB20_8
 ; RV32SFB-NEXT:  # %bb.7: # %entry
-; RV32SFB-NEXT:    mv a3, a4
+; RV32SFB-NEXT:    mv a1, a5
 ; RV32SFB-NEXT:  .LBB20_8: # %entry
 ; RV32SFB-NEXT:    mv a0, a3
 ; RV32SFB-NEXT:    ret
@@ -874,24 +874,24 @@ define i64 @select_srl(i64 %A, i64 %B, i64 %C, i1 zeroext %cond) {
 ; RV32SFB-NEXT:    not a7, a2
 ; RV32SFB-NEXT:    slli a1, a1, 1
 ; RV32SFB-NEXT:    srl t0, a0, a2
-; RV32SFB-NEXT:    sll a1, a1, a7
 ; RV32SFB-NEXT:    addi a2, a2, -32
+; RV32SFB-NEXT:    sll a1, a1, a7
 ; RV32SFB-NEXT:    mv a0, a3
-; RV32SFB-NEXT:    bgez a2, .LBB21_2
+; RV32SFB-NEXT:    bltz a2, .LBB21_2
 ; RV32SFB-NEXT:  # %bb.1: # %entry
-; RV32SFB-NEXT:    or a0, t0, a1
+; RV32SFB-NEXT:    li a3, 0
 ; RV32SFB-NEXT:  .LBB21_2: # %entry
-; RV32SFB-NEXT:    bltz a2, .LBB21_4
+; RV32SFB-NEXT:    bgez a2, .LBB21_4
 ; RV32SFB-NEXT:  # %bb.3: # %entry
-; RV32SFB-NEXT:    li a3, 0
+; RV32SFB-NEXT:    or a0, t0, a1
 ; RV32SFB-NEXT:  .LBB21_4: # %entry
 ; RV32SFB-NEXT:    beqz a6, .LBB21_6
 ; RV32SFB-NEXT:  # %bb.5: # %entry
-; RV32SFB-NEXT:    mv a0, a4
+; RV32SFB-NEXT:    mv a3, a5
 ; RV32SFB-NEXT:  .LBB21_6: # %entry
 ; RV32SFB-NEXT:    beqz a6, .LBB21_8
 ; RV32SFB-NEXT:  # %bb.7: # %entry
-; RV32SFB-NEXT:    mv a3, a5
+; RV32SFB-NEXT:    mv a0, a4
 ; RV32SFB-NEXT:  .LBB21_8: # %entry
 ; RV32SFB-NEXT:    mv a1, a3
 ; RV32SFB-NEXT:    ret
@@ -935,24 +935,24 @@ define i64 @select_sra(i64 %A, i64 %B, i64 %C, i1 zeroext %cond) {
 ; RV32SFB-NEXT:    not a7, a2
 ; RV32SFB-NEXT:    slli t0, a1, 1
 ; RV32SFB-NEXT:    srl t1, a0, a2
-; RV32SFB-NEXT:    sll a7, t0, a7
 ; RV32SFB-NEXT:    addi a2, a2, -32
+; RV32SFB-NEXT:    sll a7, t0, a7
 ; RV32SFB-NEXT:    mv a0, a3
-; RV32SFB-NEXT:    bgez a2, .LBB22_2
+; RV32SFB-NEXT:    bltz a2, .LBB22_2
 ; RV32SFB-NEXT:  # %bb.1: # %entry
-; RV32SFB-NEXT:    or a0, t1, a7
+; RV32SFB-NEXT:    srai a3, a1, 31
 ; RV32SFB-NEXT:  .LBB22_2: # %entry
-; RV32SFB-NEXT:    bltz a2, .LBB22_4
+; RV32SFB-NEXT:    bgez a2, .LBB22_4
 ; RV32SFB-NEXT:  # %bb.3: # %entry
-; RV32SFB-NEXT:    srai a3, a1, 31
+; RV32SFB-NEXT:    or a0, t1, a7
 ; RV32SFB-NEXT:  .LBB22_4: # %entry
 ; RV32SFB-NEXT:    beqz a6, .LBB22_6
 ; RV32SFB-NEXT:  # %bb.5: # %entry
-; RV32SFB-NEXT:    mv a0, a4
+; RV32SFB-NEXT:    mv a3, a5
 ; RV32SFB-NEXT:  .LBB22_6: # %entry
 ; RV32SFB-NEXT:    beqz a6, .LBB22_8
 ; RV32SFB-NEXT:  # %bb.7: # %entry
-; RV32SFB-NEXT:    mv a3, a5
+; RV32SFB-NEXT:    mv a0, a4
 ; RV32SFB-NEXT:  .LBB22_8: # %entry
 ; RV32SFB-NEXT:    mv a1, a3
 ; RV32SFB-NEXT:    ret
@@ -1088,11 +1088,11 @@ define i64 @select_andi(i64 %A, i64 %C, i1 zeroext %cond) {
 ; RV32SFB-NEXT:  # %bb.1: # %entry
 ; RV32SFB-NEXT:    andi a2, a0, 567
 ; RV32SFB-NEXT:  .LBB25_2: # %entry
+; RV32SFB-NEXT:    mv a0, a2
 ; RV32SFB-NEXT:    bnez a4, .LBB25_4
 ; RV32SFB-NEXT:  # %bb.3: # %entry
 ; RV32SFB-NEXT:    li a1, 0
 ; RV32SFB-NEXT:  .LBB25_4: # %entry
-; RV32SFB-NEXT:    mv a0, a2
 ; RV32SFB-NEXT:    ret
 entry:
  %0 = and i64 %A, 567
@@ -1130,13 +1130,13 @@ define i64 @select_ori(i64 %A, i64 %C, i1 zeroext %cond) {
 ;
 ; RV32SFB-LABEL: select_ori:
 ; RV32SFB:       # %bb.0: # %entry
-; RV32SFB-NEXT:    beqz a4, .LBB26_2
+; RV32SFB-NEXT:    bnez a4, .LBB26_2
 ; RV32SFB-NEXT:  # %bb.1: # %entry
-; RV32SFB-NEXT:    mv a1, a3
+; RV32SFB-NEXT:    ori a2, a0, 890
 ; RV32SFB-NEXT:  .LBB26_2: # %entry
-; RV32SFB-NEXT:    bnez a4, .LBB26_4
+; RV32SFB-NEXT:    beqz a4, .LBB26_4
 ; RV32SFB-NEXT:  # %bb.3: # %entry
-; RV32SFB-NEXT:    ori a2, a0, 890
+; RV32SFB-NEXT:    mv a1, a3
 ; RV32SFB-NEXT:  .LBB26_4: # %entry
 ; RV32SFB-NEXT:    mv a0, a2
 ; RV32SFB-NEXT:    ret
@@ -1176,13 +1176,13 @@ define i64 @select_xori(i64 %A, i64 %C, i1 zeroext %cond) {
 ;
 ; RV32SFB-LABEL: select_xori:
 ; RV32SFB:       # %bb.0: # %entry
-; RV32SFB-NEXT:    beqz a4, .LBB27_2
+; RV32SFB-NEXT:    bnez a4, .LBB27_2
 ; RV32SFB-NEXT:  # %bb.1: # %entry
-; RV32SFB-NEXT:    mv a1, a3
+; RV32SFB-NEXT:    xori a2, a0, 321
 ; RV32SFB-NEXT:  .LBB27_2: # %entry
-; RV32SFB-NEXT:    bnez a4, .LBB27_4
+; RV32SFB-NEXT:    beqz a4, .LBB27_4
 ; RV32SFB-NEXT:  # %bb.3: # %entry
-; RV32SFB-NEXT:    xori a2, a0, 321
+; RV32SFB-NEXT:    mv a1, a3
 ; RV32SFB-NEXT:  .LBB27_4: # %entry
 ; RV32SFB-NEXT:    mv a0, a2
 ; RV32SFB-NEXT:    ret
@@ -1272,11 +1272,11 @@ define i64 @select_srli(i64 %A, i64 %C, i1 zeroext %cond) {
 ; RV32SFB-NEXT:    mv a0, a2
 ; RV32SFB-NEXT:    bnez a4, .LBB29_2
 ; RV32SFB-NEXT:  # %bb.1: # %entry
-; RV32SFB-NEXT:    srli a0, a1, 3
+; RV32SFB-NEXT:    li a3, 0
 ; RV32SFB-NEXT:  .LBB29_2: # %entry
 ; RV32SFB-NEXT:    bnez a4, .LBB29_4
 ; RV32SFB-NEXT:  # %bb.3: # %entry
-; RV32SFB-NEXT:    li a3, 0
+; RV32SFB-NEXT:    srli a0, a1, 3
 ; RV32SFB-NEXT:  .LBB29_4: # %entry
 ; RV32SFB-NEXT:    mv a1, a3
 ; RV32SFB-NEXT:    ret

From 99824cf7967922bdd9ac895c949f330bb8d6b85a Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Thu, 29 Feb 2024 14:32:42 +0000
Subject: [PATCH 25/55] [lldb][test] Use pexpect spawn instead of spawnu

This is marked deprecated from at least 4.6 onward:
Deprecated: pass encoding to spawn() instead.
---
 lldb/test/API/macosx/nslog/TestDarwinNSLogOutput.py | 5 +++--
 lldb/test/API/terminal/TestSTTYBeforeAndAfter.py    | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/lldb/test/API/macosx/nslog/TestDarwinNSLogOutput.py b/lldb/test/API/macosx/nslog/TestDarwinNSLogOutput.py
index d7560156e0571a..15d9feb543895a 100644
--- a/lldb/test/API/macosx/nslog/TestDarwinNSLogOutput.py
+++ b/lldb/test/API/macosx/nslog/TestDarwinNSLogOutput.py
@@ -56,8 +56,9 @@ def run_lldb_to_breakpoint(self, exe, source_file, line, settings_commands=None)
         # So that the child gets torn down after the test.
         import pexpect
 
-        self.child = pexpect.spawnu(
-            "%s %s %s" % (lldbtest_config.lldbExec, self.lldbOption, exe)
+        self.child = pexpect.spawn(
+            "%s %s %s" % (lldbtest_config.lldbExec, self.lldbOption, exe),
+            encoding="utf-8",
         )
         child = self.child
 
diff --git a/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py b/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py
index e9b5940ff1adaf..31b960859fa2e5 100644
--- a/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py
+++ b/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py
@@ -37,7 +37,7 @@ def test_stty_dash_a_before_and_afetr_invoking_lldb_command(self):
         lldb_prompt = "(lldb) "
 
         # So that the child gets torn down after the test.
-        self.child = pexpect.spawnu("expect")
+        self.child = pexpect.spawn("expect", encoding="utf-8")
         child = self.child
 
         child.expect(expect_prompt)

From e60ebbd0001f2e66cb9f76874ddd69290e2086c1 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <1802579+farzonl@users.noreply.github.com>
Date: Thu, 29 Feb 2024 10:01:36 -0500
Subject: [PATCH 26/55] [HLSL] implementation of lerp intrinsic (#83077)

This is the start of implementing the lerp intrinsic
https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-lerp
Builtins.td - defines the builtin
hlsl_intrinsics.h - defines the lerp api
DiagnosticSemaKinds.td - needed a new error to be inclusive for more
than two operands.
CGBuiltin.cpp - add the lerp intrinsic lowering
SemaChecking.cpp - type checks for lerp builtin
IntrinsicsDirectX.td - define the lerp intrinsic

this change implements the first half of #70102

Co-authored-by: Xiang Li <python3kgae@outlook.com>
---
 clang/include/clang/Basic/Builtins.td         |   6 +
 .../clang/Basic/DiagnosticSemaKinds.td        |   5 +
 clang/lib/CodeGen/CGBuiltin.cpp               |  40 +++++++
 clang/lib/Headers/hlsl/hlsl_intrinsics.h      |  36 ++++++
 clang/lib/Sema/SemaChecking.cpp               |  69 +++++++-----
 .../CodeGenHLSL/builtins/lerp-builtin.hlsl    |  37 ++++++
 clang/test/CodeGenHLSL/builtins/lerp.hlsl     | 105 ++++++++++++++++++
 clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl  |  28 ++---
 clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl |  91 +++++++++++++++
 llvm/include/llvm/IR/IntrinsicsDirectX.td     |   5 +
 10 files changed, 381 insertions(+), 41 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
 create mode 100644 clang/test/CodeGenHLSL/builtins/lerp.hlsl
 create mode 100644 clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl

diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 2fbc56d49a59a1..36151b49d9363d 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4536,6 +4536,12 @@ def HLSLDotProduct : LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void(...)";
 }
 
+def HLSLLerp : LangBuiltin<"HLSL_LANG"> {
+  let Spellings = ["__builtin_hlsl_lerp"];
+  let Attributes = [NoThrow, Const];
+  let Prototype = "void(...)";
+}
+
 // Builtins for XRay.
 def XRayCustomEvent : Builtin {
   let Spellings = ["__xray_customevent"];
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index ff88c4f293abfd..4ef3ac8c96cd26 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10266,6 +10266,11 @@ def err_block_on_vm : Error<
 def err_sizeless_nonlocal : Error<
   "non-local variable with sizeless type %0">;
 
+def err_vec_builtin_non_vector_all : Error<
+ "all arguments to %0 must be vectors">;
+def err_vec_builtin_incompatible_vector_all : Error<
+  "all arguments to %0 must have vectors of the same type">;
+  
 def err_vec_builtin_non_vector : Error<
  "first two arguments to %0 must be vectors">;
 def err_vec_builtin_incompatible_vector : Error<
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 2d16e7cdc06053..74ca96117793c4 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18007,6 +18007,46 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
         /*ReturnType*/ T0->getScalarType(), Intrinsic::dx_dot,
         ArrayRef<Value *>{Op0, Op1}, nullptr, "dx.dot");
   } break;
+  case Builtin::BI__builtin_hlsl_lerp: {
+    Value *X = EmitScalarExpr(E->getArg(0));
+    Value *Y = EmitScalarExpr(E->getArg(1));
+    Value *S = EmitScalarExpr(E->getArg(2));
+    llvm::Type *Xty = X->getType();
+    llvm::Type *Yty = Y->getType();
+    llvm::Type *Sty = S->getType();
+    if (!Xty->isVectorTy() && !Yty->isVectorTy() && !Sty->isVectorTy()) {
+      if (Xty->isFloatingPointTy()) {
+        auto V = Builder.CreateFSub(Y, X);
+        V = Builder.CreateFMul(S, V);
+        return Builder.CreateFAdd(X, V, "dx.lerp");
+      }
+      // DXC does this via casting to float should we do the same thing?
+      if (Xty->isIntegerTy()) {
+        auto V = Builder.CreateSub(Y, X);
+        V = Builder.CreateMul(S, V);
+        return Builder.CreateAdd(X, V, "dx.lerp");
+      }
+      // Bools should have been promoted
+      llvm_unreachable("Scalar Lerp is only supported on ints and floats.");
+    }
+    // A VectorSplat should have happened
+    assert(Xty->isVectorTy() && Yty->isVectorTy() && Sty->isVectorTy() &&
+           "Lerp of vector and scalar is not supported.");
+
+    [[maybe_unused]] auto *XVecTy =
+        E->getArg(0)->getType()->getAs<VectorType>();
+    [[maybe_unused]] auto *YVecTy =
+        E->getArg(1)->getType()->getAs<VectorType>();
+    [[maybe_unused]] auto *SVecTy =
+        E->getArg(2)->getType()->getAs<VectorType>();
+    // A HLSLVectorTruncation should have happend
+    assert(XVecTy->getNumElements() == YVecTy->getNumElements() &&
+           SVecTy->getNumElements() &&
+           "Lerp requires vectors to be of the same size.");
+    return Builder.CreateIntrinsic(
+        /*ReturnType*/ Xty, Intrinsic::dx_lerp, ArrayRef<Value *>{X, Y, S},
+        nullptr, "dx.lerp");
+  }
   }
   return nullptr;
 }
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 08e5d981a4a4ca..1314bdefa37e7b 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -317,6 +317,42 @@ double3 floor(double3);
 _HLSL_BUILTIN_ALIAS(__builtin_elementwise_floor)
 double4 floor(double4);
 
+//===----------------------------------------------------------------------===//
+// lerp builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn T lerp(T x, T y, T s)
+/// \brief Returns the linear interpolation of x to y by s.
+/// \param x [in] The first-floating point value.
+/// \param y [in] The second-floating point value.
+/// \param s [in] A value that linearly interpolates between the x parameter and
+/// the y parameter.
+///
+/// Linear interpolation is based on the following formula: x*(1-s) + y*s which
+/// can equivalently be written as x + s(y-x).
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_lerp)
+half lerp(half, half, half);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_lerp)
+half2 lerp(half2, half2, half2);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_lerp)
+half3 lerp(half3, half3, half3);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_lerp)
+half4 lerp(half4, half4, half4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_lerp)
+float lerp(float, float, float);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_lerp)
+float2 lerp(float2, float2, float2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_lerp)
+float3 lerp(float3, float3, float3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_lerp)
+float4 lerp(float4, float4, float4);
+
 //===----------------------------------------------------------------------===//
 // log builtins
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 35d453e013e84b..016e9830662042 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5197,43 +5197,49 @@ bool Sema::CheckPPCMMAType(QualType Type, SourceLocation TypeLoc) {
 bool CheckVectorElementCallArgs(Sema *S, CallExpr *TheCall) {
   assert(TheCall->getNumArgs() > 1);
   ExprResult A = TheCall->getArg(0);
-  ExprResult B = TheCall->getArg(1);
+
   QualType ArgTyA = A.get()->getType();
-  QualType ArgTyB = B.get()->getType();
+
   auto *VecTyA = ArgTyA->getAs<VectorType>();
-  auto *VecTyB = ArgTyB->getAs<VectorType>();
   SourceLocation BuiltinLoc = TheCall->getBeginLoc();
-  if (VecTyA == nullptr && VecTyB == nullptr)
-    return false;
 
-  if (VecTyA && VecTyB) {
-    bool retValue = false;
-    if (VecTyA->getElementType() != VecTyB->getElementType()) {
-      // Note: type promotion is intended to be handeled via the intrinsics
-      //  and not the builtin itself.
-      S->Diag(TheCall->getBeginLoc(), diag::err_vec_builtin_incompatible_vector)
-          << TheCall->getDirectCallee()
-          << SourceRange(A.get()->getBeginLoc(), B.get()->getEndLoc());
-      retValue = true;
-    }
-    if (VecTyA->getNumElements() != VecTyB->getNumElements()) {
-      // if we get here a HLSLVectorTruncation is needed.
-      S->Diag(BuiltinLoc, diag::err_vec_builtin_incompatible_vector)
-          << TheCall->getDirectCallee()
-          << SourceRange(TheCall->getArg(0)->getBeginLoc(),
-                         TheCall->getArg(1)->getEndLoc());
-      retValue = true;
-    }
+  for (unsigned i = 1; i < TheCall->getNumArgs(); ++i) {
+    ExprResult B = TheCall->getArg(i);
+    QualType ArgTyB = B.get()->getType();
+    auto *VecTyB = ArgTyB->getAs<VectorType>();
+    if (VecTyA == nullptr && VecTyB == nullptr)
+      return false;
+
+    if (VecTyA && VecTyB) {
+      bool retValue = false;
+      if (VecTyA->getElementType() != VecTyB->getElementType()) {
+        // Note: type promotion is intended to be handeled via the intrinsics
+        //  and not the builtin itself.
+        S->Diag(TheCall->getBeginLoc(),
+                diag::err_vec_builtin_incompatible_vector_all)
+            << TheCall->getDirectCallee()
+            << SourceRange(A.get()->getBeginLoc(), B.get()->getEndLoc());
+        retValue = true;
+      }
+      if (VecTyA->getNumElements() != VecTyB->getNumElements()) {
+        // if we get here a HLSLVectorTruncation is needed.
+        S->Diag(BuiltinLoc, diag::err_vec_builtin_incompatible_vector_all)
+            << TheCall->getDirectCallee()
+            << SourceRange(TheCall->getArg(0)->getBeginLoc(),
+                           TheCall->getArg(1)->getEndLoc());
+        retValue = true;
+      }
 
-    if (retValue)
-      TheCall->setType(VecTyA->getElementType());
+      if (!retValue)
+        TheCall->setType(VecTyA->getElementType());
 
-    return retValue;
+      return retValue;
+    }
   }
 
   // Note: if we get here one of the args is a scalar which
   // requires a VectorSplat on Arg0 or Arg1
-  S->Diag(BuiltinLoc, diag::err_vec_builtin_non_vector)
+  S->Diag(BuiltinLoc, diag::err_vec_builtin_non_vector_all)
       << TheCall->getDirectCallee()
       << SourceRange(TheCall->getArg(0)->getBeginLoc(),
                      TheCall->getArg(1)->getEndLoc());
@@ -5253,6 +5259,15 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
       return true;
     break;
   }
+  case Builtin::BI__builtin_hlsl_lerp: {
+    if (checkArgCount(*this, TheCall, 3))
+      return true;
+    if (CheckVectorElementCallArgs(this, TheCall))
+      return true;
+    if (SemaBuiltinElementwiseTernaryMath(TheCall))
+      return true;
+    break;
+  }
   }
   return false;
 }
diff --git a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
new file mode 100644
index 00000000000000..1f16dec68212e4
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+
+
+// CHECK-LABEL: builtin_lerp_half_scalar
+// CHECK: %3 = fsub double %conv1, %conv
+// CHECK: %4 = fmul double %conv2, %3
+// CHECK: %dx.lerp = fadd double %conv, %4
+// CHECK: %conv3 = fptrunc double %dx.lerp to half
+// CHECK: ret half %conv3
+half builtin_lerp_half_scalar (half p0) {
+  return __builtin_hlsl_lerp ( p0, p0, p0 );
+}
+
+// CHECK-LABEL: builtin_lerp_float_scalar
+// CHECK: %3 = fsub double %conv1, %conv
+// CHECK: %4 = fmul double %conv2, %3
+// CHECK: %dx.lerp = fadd double %conv, %4
+// CHECK: %conv3 = fptrunc double %dx.lerp to float
+// CHECK: ret float %conv3
+float builtin_lerp_float_scalar ( float p0) {
+  return __builtin_hlsl_lerp ( p0, p0, p0 );
+}
+
+// CHECK-LABEL: builtin_lerp_half_vector
+// CHECK: %dx.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
+// CHECK: ret <3 x half> %dx.lerp
+half3 builtin_lerp_half_vector (half3 p0) {
+  return __builtin_hlsl_lerp ( p0, p0, p0 );
+}
+
+// CHECK-LABEL: builtin_lerp_floar_vector
+// CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// CHECK: ret <2 x float> %dx.lerp
+float2 builtin_lerp_floar_vector ( float2 p0) {
+  return __builtin_hlsl_lerp ( p0, p0, p0 );
+}
diff --git a/clang/test/CodeGenHLSL/builtins/lerp.hlsl b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
new file mode 100644
index 00000000000000..1297f6b85bbd48
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
@@ -0,0 +1,105 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// NATIVE_HALF: %3 = fsub half %1, %0
+// NATIVE_HALF: %4 = fmul half %2, %3
+// NATIVE_HALF: %dx.lerp = fadd half %0, %4
+// NATIVE_HALF: ret half %dx.lerp
+// NO_HALF: %3 = fsub float %1, %0
+// NO_HALF: %4 = fmul float %2, %3
+// NO_HALF: %dx.lerp = fadd float %0, %4
+// NO_HALF: ret float %dx.lerp
+half test_lerp_half ( half p0) {
+  return lerp ( p0, p0, p0 );
+}
+
+// NATIVE_HALF: %dx.lerp = call <2 x half> @llvm.dx.lerp.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+// NATIVE_HALF: ret <2 x half> %dx.lerp
+// NO_HALF: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// NO_HALF: ret <2 x float> %dx.lerp
+half2  test_lerp_half2 ( half2 p0, half2 p1 ) {
+  return lerp ( p0, p0, p0 );
+}
+
+// NATIVE_HALF: %dx.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
+// NATIVE_HALF: ret <3 x half> %dx.lerp
+// NO_HALF: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// NO_HALF: ret <3 x float> %dx.lerp
+half3  test_lerp_half3 ( half3 p0, half3 p1 ) {
+  return lerp ( p0, p0, p0 );
+}
+
+// NATIVE_HALF: %dx.lerp = call <4 x half> @llvm.dx.lerp.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
+// NATIVE_HALF: ret <4 x half> %dx.lerp
+// NO_HALF: %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// NO_HALF: ret <4 x float> %dx.lerp
+half4  test_lerp_half4 ( half4 p0, half4 p1 ) {
+  return lerp ( p0, p0, p0 );
+}
+
+// CHECK: %3 = fsub float %1, %0
+// CHECK: %4 = fmul float %2, %3
+// CHECK: %dx.lerp = fadd float %0, %4
+// CHECK: ret float %dx.lerp
+float  test_lerp_float ( float p0, float p1 ) {
+  return lerp ( p0, p0, p0 );
+}
+
+// CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// CHECK: ret <2 x float> %dx.lerp
+float2  test_lerp_float2 ( float2 p0, float2 p1 ) {
+  return lerp ( p0, p0, p0 );
+}
+
+// CHECK: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// CHECK: ret <3 x float> %dx.lerp
+float3  test_lerp_float3 ( float3 p0, float3 p1 ) {
+  return lerp ( p0, p0, p0 );
+}
+
+// CHECK: %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// CHECK: ret <4 x float> %dx.lerp
+float4  test_lerp_float4 ( float4 p0, float4 p1) {
+  return lerp ( p0, p0, p0 );
+}
+
+// CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
+// CHECK: ret <2 x float> %dx.lerp
+float2  test_lerp_float2_splat ( float p0, float2 p1 ) {
+  return lerp( p0, p1, p1 );
+}
+
+// CHECK: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2)
+// CHECK: ret <3 x float> %dx.lerp
+float3  test_lerp_float3_splat ( float p0, float3 p1 ) {
+  return lerp( p0, p1, p1 );
+}
+
+// CHECK:  %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2)
+// CHECK:  ret <4 x float> %dx.lerp
+float4  test_lerp_float4_splat ( float p0, float4 p1 ) {
+  return lerp( p0, p1, p1 );
+}
+
+// CHECK: %conv = sitofp i32 %2 to float
+// CHECK: %splat.splatinsert = insertelement <2 x float> poison, float %conv, i64 0
+// CHECK: %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat)
+// CHECK: ret <2 x float> %dx.lerp
+float2 test_lerp_float2_int_splat ( float2 p0, int p1 ) {
+  return lerp ( p0, p0, p1 );
+}
+
+// CHECK: %conv = sitofp i32 %2 to float
+// CHECK: %splat.splatinsert = insertelement <3 x float> poison, float %conv, i64 0
+// CHECK: %splat.splat = shufflevector <3 x float> %splat.splatinsert, <3 x float> poison, <3 x i32> zeroinitializer
+// CHECK:  %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat)
+// CHECK: ret <3 x float> %dx.lerp
+float3 test_lerp_float3_int_splat ( float3 p0, int p1 ) {
+  return lerp ( p0, p0, p1 );
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
index 54d093aa7ce3a4..5dbb52a80c6bd0 100644
--- a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
@@ -22,7 +22,7 @@ float test_dot_vector_size_mismatch ( float3 p0, float2 p1 ) {
 
 float test_dot_builtin_vector_size_mismatch ( float3 p0, float2 p1 ) {
   return __builtin_hlsl_dot ( p0, p1 );
-  // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same type}}
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 
 float test_dot_scalar_mismatch ( float p0, int p1 ) {
@@ -38,69 +38,69 @@ float test_dot_element_type_mismatch ( int2 p0, float2 p1 ) {
 //NOTE: for all the *_promotion we are intentionally not handling type promotion in builtins
 float test_builtin_dot_vec_int_to_float_promotion ( int2 p0, float2 p1 ) {
   return __builtin_hlsl_dot ( p0, p1 );
-  // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same type}}
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 
 int64_t test_builtin_dot_vec_int_to_int64_promotion( int64_t2 p0, int2 p1 ) {
   return __builtin_hlsl_dot( p0, p1 );
-  // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same type}}
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 
 float test_builtin_dot_vec_half_to_float_promotion( float2 p0, half2 p1 ) {
   return __builtin_hlsl_dot( p0, p1 );
-  // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same type}}
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 
 #ifdef __HLSL_ENABLE_16_BIT
 float test_builtin_dot_vec_int16_to_float_promotion( float2 p0, int16_t2 p1 ) {
   return __builtin_hlsl_dot( p0, p1 );
-  // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same type}}
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 
 half test_builtin_dot_vec_int16_to_half_promotion( half2 p0, int16_t2 p1 ) {
   return __builtin_hlsl_dot( p0, p1 );
-  // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same type}}
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 
 int test_builtin_dot_vec_int16_to_int_promotion( int2 p0, int16_t2 p1 ) {
   return __builtin_hlsl_dot( p0, p1 );
-  // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same type}}
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 
 int64_t test_builtin_dot_vec_int16_to_int64_promotion( int64_t2 p0, int16_t2 p1 ) {
   return __builtin_hlsl_dot( p0, p1 );
-  // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same type}}
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 #endif
 
 float test_builtin_dot_float2_splat ( float p0, float2 p1 ) {
   return __builtin_hlsl_dot( p0, p1 );
-  // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must be vectors}}
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
 }
 
 float test_builtin_dot_float3_splat ( float p0, float3 p1 ) {
   return __builtin_hlsl_dot( p0, p1 );
-  // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must be vectors}}
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
 }
 
 float test_builtin_dot_float4_splat ( float p0, float4 p1 ) {
   return __builtin_hlsl_dot( p0, p1 );
-  // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must be vectors}}
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
 }
 
 float test_dot_float2_int_splat ( float2 p0, int p1 ) {
   return __builtin_hlsl_dot ( p0, p1 );
-  // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must be vectors}}
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
 }
 
 float test_dot_float3_int_splat ( float3 p0, int p1 ) {
   return __builtin_hlsl_dot ( p0, p1 );
-  // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must be vectors}}
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
 }
 
 float test_builtin_dot_int_vect_to_float_vec_promotion ( int2 p0, float p1 ) {
   return __builtin_hlsl_dot ( p0, p1 );
-  // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must be vectors}}
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
 }
 
 int test_builtin_dot_bool_type_promotion ( bool p0, bool p1 ) {
diff --git a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
new file mode 100644
index 00000000000000..c062aa60a8df45
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
@@ -0,0 +1,91 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected
+
+float2 test_no_second_arg ( float2 p0) {
+  return __builtin_hlsl_lerp ( p0 );
+  // expected-error@-1 {{too few arguments to function call, expected 3, have 1}}
+}
+
+float2 test_no_third_arg ( float2 p0) {
+  return __builtin_hlsl_lerp ( p0, p0 );
+  // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
+}
+
+float2 test_too_many_arg ( float2 p0) {
+  return __builtin_hlsl_lerp ( p0, p0, p0, p0 );
+  // expected-error@-1 {{too many arguments to function call, expected 3, have 4}}
+}
+
+float2 test_lerp_no_second_arg ( float2 p0) {
+  return lerp ( p0 );
+  // expected-error@-1 {{no matching function for call to 'lerp'}}
+}
+
+float2 test_lerp_vector_size_mismatch ( float3 p0, float2 p1 ) {
+  return lerp ( p0, p0, p1 );
+  // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'float __attribute__((ext_vector_type(2)))' (vector of 2 'float' values)}}
+}
+
+float test_lerp_builtin_vector_size_mismatch ( float3 p0, float2 p1 ) {
+  return __builtin_hlsl_lerp ( p0, p1, p1 );
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must have vectors of the same type}}
+}
+
+float test_lerp_scalar_mismatch ( float p0, half p1 ) {
+  return lerp ( p1, p0, p1 );
+  // expected-error@-1 {{call to 'lerp' is ambiguous}}
+}
+
+float test_lerp_element_type_mismatch ( half2 p0, float2 p1 ) {
+  return lerp ( p1, p0, p1 );
+  // expected-error@-1 {{call to 'lerp' is ambiguous}}
+}
+
+float test_builtin_lerp_float2_splat ( float p0, float2 p1 ) {
+  return __builtin_hlsl_lerp( p0, p1, p1 );
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
+}
+
+float test_builtin_lerp_float3_splat ( float p0, float3 p1 ) {
+  return  __builtin_hlsl_lerp( p0, p1, p1 );
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
+}
+
+float test_builtin_lerp_float4_splat ( float p0, float4 p1 ) {
+  return  __builtin_hlsl_lerp( p0, p1, p1 );
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
+}
+
+float test_lerp_float2_int_splat ( float2 p0, int p1 ) {
+  return  __builtin_hlsl_lerp( p0, p1, p1 );
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
+}
+
+float test_lerp_float3_int_splat ( float3 p0, int p1 ) {
+  return  __builtin_hlsl_lerp( p0, p1, p1 );
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
+}
+
+float test_builtin_lerp_int_vect_to_float_vec_promotion ( int2 p0, float p1 ) {
+  return  __builtin_hlsl_lerp( p0, p1, p1 );
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
+}
+
+int test_builtin_lerp_bool_type_promotion (bool p0) {
+  return  __builtin_hlsl_lerp( p0, p0, p0 );
+  // expected-error@-1 {{1st argument must be a floating point type (was 'bool')}}
+}
+
+float builtin_bool_to_float_type_promotion ( float p0, bool p1 ) {
+  return __builtin_hlsl_lerp ( p0, p0, p1 );
+   // expected-error@-1 {{3rd argument must be a floating point type (was 'bool')}}
+}
+
+float builtin_bool_to_float_type_promotion2 ( bool p0, float p1 ) {
+  return __builtin_hlsl_lerp ( p1, p0, p1 );
+  // expected-error@-1 {{2nd argument must be a floating point type (was 'bool')}}
+}
+
+float builtin_lerp_int_to_float_promotion ( float p0, int p1 ) {
+  return __builtin_hlsl_lerp ( p0, p0, p1 );
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'int')}}
+}
\ No newline at end of file
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index c192d4b84417c9..48332b917693a0 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -24,4 +24,9 @@ def int_dx_dot :
     Intrinsic<[LLVMVectorElementType<0>], 
     [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
     [IntrNoMem, IntrWillReturn, Commutative] >;
+
+def int_dx_lerp :
+    Intrinsic<[LLVMMatchType<0>], 
+    [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
+    [IntrNoMem, IntrWillReturn, Commutative] >;
 }

From 37dca605c9bd41732da010ee97ed15ad9585a37d Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Thu, 29 Feb 2024 10:12:22 -0500
Subject: [PATCH 27/55] [libc++] Clean up includes of <__assert> (#80091)

Originally, we used __libcpp_verbose_abort to handle assertion failures.
That function was declared from all public headers. Since we don't use
that mechanism anymore, we don't need to declare __libcpp_verbose_abort
from all public headers, and we can clean up a lot of unnecessary
includes.

This patch also moves the definition of the various assertion categories
to the <__assert> header, since we now rely on regular IWYU for these
assertion macros.

rdar://105510916
---
 libcxx/include/__assert                       | 81 +++++++++++++++++++
 libcxx/include/__atomic/atomic_flag.h         |  1 +
 .../include/__charconv/from_chars_integral.h  |  1 +
 libcxx/include/__charconv/to_chars_base_10.h  |  1 +
 libcxx/include/__charconv/to_chars_integral.h |  1 +
 libcxx/include/__charconv/traits.h            |  1 +
 libcxx/include/__config                       | 81 -------------------
 .../__format/formatter_floating_point.h       |  1 +
 .../include/__numeric/saturation_arithmetic.h |  1 +
 .../__random/negative_binomial_distribution.h |  1 +
 libcxx/include/__ranges/repeat_view.h         |  1 +
 libcxx/include/__stop_token/stop_state.h      |  1 +
 libcxx/include/__string/char_traits.h         |  1 +
 libcxx/include/algorithm                      |  1 -
 libcxx/include/any                            |  1 -
 libcxx/include/array                          |  2 +-
 libcxx/include/atomic                         |  1 -
 libcxx/include/barrier                        |  2 +-
 libcxx/include/bit                            |  1 -
 libcxx/include/bitset                         |  1 -
 libcxx/include/cassert                        |  1 -
 libcxx/include/ccomplex                       |  1 -
 libcxx/include/cctype                         |  1 -
 libcxx/include/cerrno                         |  1 -
 libcxx/include/cfenv                          |  1 -
 libcxx/include/cfloat                         |  1 -
 libcxx/include/charconv                       |  1 -
 libcxx/include/chrono                         |  1 -
 libcxx/include/cinttypes                      |  1 -
 libcxx/include/ciso646                        |  1 -
 libcxx/include/climits                        |  1 -
 libcxx/include/clocale                        |  1 -
 libcxx/include/cmath                          |  1 -
 libcxx/include/codecvt                        |  1 -
 libcxx/include/compare                        |  1 -
 libcxx/include/complex                        |  1 -
 libcxx/include/concepts                       |  1 -
 libcxx/include/condition_variable             |  1 -
 libcxx/include/coroutine                      |  1 -
 libcxx/include/csetjmp                        |  1 -
 libcxx/include/csignal                        |  1 -
 libcxx/include/cstdarg                        |  1 -
 libcxx/include/cstdbool                       |  1 -
 libcxx/include/cstddef                        |  1 -
 libcxx/include/cstdint                        |  1 -
 libcxx/include/cstdio                         |  1 -
 libcxx/include/cstdlib                        |  1 -
 libcxx/include/cstring                        |  1 -
 libcxx/include/ctgmath                        |  1 -
 libcxx/include/ctime                          |  1 -
 libcxx/include/cuchar                         |  1 -
 libcxx/include/cwchar                         |  1 -
 libcxx/include/cwctype                        |  1 -
 libcxx/include/deque                          |  2 +-
 libcxx/include/exception                      |  1 -
 libcxx/include/execution                      |  1 -
 libcxx/include/expected                       |  1 -
 libcxx/include/experimental/__simd/scalar.h   |  1 +
 libcxx/include/experimental/__simd/vec_ext.h  |  1 +
 libcxx/include/experimental/iterator          |  1 -
 libcxx/include/experimental/propagate_const   |  1 -
 libcxx/include/experimental/simd              |  2 -
 libcxx/include/experimental/type_traits       |  1 -
 libcxx/include/experimental/utility           |  1 -
 libcxx/include/ext/hash_map                   |  1 -
 libcxx/include/ext/hash_set                   |  1 -
 libcxx/include/filesystem                     |  1 -
 libcxx/include/format                         |  1 -
 libcxx/include/forward_list                   |  1 -
 libcxx/include/fstream                        |  2 +-
 libcxx/include/functional                     |  1 -
 libcxx/include/future                         |  2 +-
 libcxx/include/initializer_list               |  1 -
 libcxx/include/iomanip                        |  1 -
 libcxx/include/ios                            |  1 -
 libcxx/include/iosfwd                         |  1 -
 libcxx/include/iostream                       |  1 -
 libcxx/include/istream                        |  1 -
 libcxx/include/iterator                       |  1 -
 libcxx/include/latch                          |  2 +-
 libcxx/include/limits                         |  1 -
 libcxx/include/list                           |  2 +-
 libcxx/include/locale                         |  2 +-
 libcxx/include/map                            |  2 +-
 libcxx/include/memory                         |  1 -
 libcxx/include/mutex                          |  1 -
 libcxx/include/new                            |  2 +-
 libcxx/include/numbers                        |  1 -
 libcxx/include/numeric                        |  1 -
 libcxx/include/optional                       |  2 +-
 libcxx/include/ostream                        |  1 -
 libcxx/include/print                          |  2 +-
 libcxx/include/queue                          |  1 -
 libcxx/include/random                         |  1 -
 libcxx/include/ranges                         |  1 -
 libcxx/include/ratio                          |  1 -
 libcxx/include/regex                          |  2 +-
 libcxx/include/scoped_allocator               |  1 -
 libcxx/include/semaphore                      |  2 +-
 libcxx/include/set                            |  2 +-
 libcxx/include/shared_mutex                   |  1 -
 libcxx/include/span                           |  2 +-
 libcxx/include/sstream                        |  1 -
 libcxx/include/stack                          |  1 -
 libcxx/include/stdexcept                      |  1 -
 libcxx/include/stop_token                     |  1 -
 libcxx/include/streambuf                      |  1 -
 libcxx/include/string                         |  2 +-
 libcxx/include/string_view                    |  2 +-
 libcxx/include/strstream                      |  1 -
 libcxx/include/system_error                   |  1 -
 libcxx/include/thread                         |  1 -
 libcxx/include/tuple                          |  1 -
 libcxx/include/type_traits                    |  2 +-
 libcxx/include/typeindex                      |  1 -
 libcxx/include/typeinfo                       |  1 -
 libcxx/include/unordered_map                  |  2 +-
 libcxx/include/unordered_set                  |  2 +-
 libcxx/include/utility                        |  1 -
 libcxx/include/valarray                       |  2 +-
 libcxx/include/variant                        |  1 -
 libcxx/include/vector                         |  2 +-
 libcxx/include/version                        |  1 -
 ...customize_verbose_abort.link-time.pass.cpp |  1 +
 .../headers_declare_verbose_abort.gen.py      | 33 --------
 .../libcxx/assertions/modes/none.pass.cpp     |  1 +
 .../generate_feature_test_macro_components.py |  1 -
 127 files changed, 119 insertions(+), 224 deletions(-)
 delete mode 100644 libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py

diff --git a/libcxx/include/__assert b/libcxx/include/__assert
index eb862b5369b258..49769fb4d44978 100644
--- a/libcxx/include/__assert
+++ b/libcxx/include/__assert
@@ -34,4 +34,85 @@
 #  define _LIBCPP_ASSUME(expression) ((void)0)
 #endif
 
+// clang-format off
+// Fast hardening mode checks.
+
+#if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_FAST
+
+// Enabled checks.
+#  define _LIBCPP_ASSERT_VALID_INPUT_RANGE(expression, message)       _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message)    _LIBCPP_ASSERT(expression, message)
+// Disabled checks.
+// On most modern platforms, dereferencing a null pointer does not lead to an actual memory access.
+#  define _LIBCPP_ASSERT_NON_NULL(expression, message)                _LIBCPP_ASSUME(expression)
+// Overlapping ranges will make algorithms produce incorrect results but don't directly lead to a security
+// vulnerability.
+#  define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message)  _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_VALID_DEALLOCATION(expression, message)      _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(expression, message) _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message)    _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message)  _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_PEDANTIC(expression, message)                _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(expression, message)    _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_INTERNAL(expression, message)                _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message)           _LIBCPP_ASSUME(expression)
+
+// Extensive hardening mode checks.
+
+#elif _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_EXTENSIVE
+
+// Enabled checks.
+#  define _LIBCPP_ASSERT_VALID_INPUT_RANGE(expression, message)       _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message)    _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_NON_NULL(expression, message)                _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message)  _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_VALID_DEALLOCATION(expression, message)      _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(expression, message) _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message)    _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message)  _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_PEDANTIC(expression, message)                _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message)           _LIBCPP_ASSERT(expression, message)
+// Disabled checks.
+#  define _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(expression, message)    _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_INTERNAL(expression, message)                _LIBCPP_ASSUME(expression)
+
+// Debug hardening mode checks.
+
+#elif _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+
+// All checks enabled.
+#  define _LIBCPP_ASSERT_VALID_INPUT_RANGE(expression, message)       _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message)    _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_NON_NULL(expression, message)                _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message)  _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_VALID_DEALLOCATION(expression, message)      _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(expression, message) _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message)    _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message)  _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_PEDANTIC(expression, message)                _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(expression, message)    _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_INTERNAL(expression, message)                _LIBCPP_ASSERT(expression, message)
+#  define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message)           _LIBCPP_ASSERT(expression, message)
+
+// Disable all checks if hardening is not enabled.
+
+#else
+
+// All checks disabled.
+#  define _LIBCPP_ASSERT_VALID_INPUT_RANGE(expression, message)       _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message)    _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_NON_NULL(expression, message)                _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message)  _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_VALID_DEALLOCATION(expression, message)      _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(expression, message) _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message)    _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message)  _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_PEDANTIC(expression, message)                _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(expression, message)    _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_INTERNAL(expression, message)                _LIBCPP_ASSUME(expression)
+#  define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message)           _LIBCPP_ASSUME(expression)
+
+#endif // _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_FAST
+// clang-format on
+
 #endif // _LIBCPP___ASSERT
diff --git a/libcxx/include/__atomic/atomic_flag.h b/libcxx/include/__atomic/atomic_flag.h
index a45a7183547726..18a864523de06f 100644
--- a/libcxx/include/__atomic/atomic_flag.h
+++ b/libcxx/include/__atomic/atomic_flag.h
@@ -13,6 +13,7 @@
 #include <__atomic/contention_t.h>
 #include <__atomic/cxx_atomic_impl.h>
 #include <__atomic/memory_order.h>
+#include <__availability>
 #include <__chrono/duration.h>
 #include <__config>
 #include <__thread/support.h>
diff --git a/libcxx/include/__charconv/from_chars_integral.h b/libcxx/include/__charconv/from_chars_integral.h
index e969cedb33cbe4..c1f033b37b913e 100644
--- a/libcxx/include/__charconv/from_chars_integral.h
+++ b/libcxx/include/__charconv/from_chars_integral.h
@@ -11,6 +11,7 @@
 #define _LIBCPP___CHARCONV_FROM_CHARS_INTEGRAL_H
 
 #include <__algorithm/copy_n.h>
+#include <__assert>
 #include <__charconv/from_chars_result.h>
 #include <__charconv/traits.h>
 #include <__config>
diff --git a/libcxx/include/__charconv/to_chars_base_10.h b/libcxx/include/__charconv/to_chars_base_10.h
index 0dee351521f9c6..c49f4f6797aa43 100644
--- a/libcxx/include/__charconv/to_chars_base_10.h
+++ b/libcxx/include/__charconv/to_chars_base_10.h
@@ -11,6 +11,7 @@
 #define _LIBCPP___CHARCONV_TO_CHARS_BASE_10_H
 
 #include <__algorithm/copy_n.h>
+#include <__assert>
 #include <__charconv/tables.h>
 #include <__config>
 #include <cstdint>
diff --git a/libcxx/include/__charconv/to_chars_integral.h b/libcxx/include/__charconv/to_chars_integral.h
index 40fbe334d8d54c..0369f4dfb9bda6 100644
--- a/libcxx/include/__charconv/to_chars_integral.h
+++ b/libcxx/include/__charconv/to_chars_integral.h
@@ -11,6 +11,7 @@
 #define _LIBCPP___CHARCONV_TO_CHARS_INTEGRAL_H
 
 #include <__algorithm/copy_n.h>
+#include <__assert>
 #include <__bit/countl.h>
 #include <__charconv/tables.h>
 #include <__charconv/to_chars_base_10.h>
diff --git a/libcxx/include/__charconv/traits.h b/libcxx/include/__charconv/traits.h
index b4907c3f775715..c91c6da3247978 100644
--- a/libcxx/include/__charconv/traits.h
+++ b/libcxx/include/__charconv/traits.h
@@ -10,6 +10,7 @@
 #ifndef _LIBCPP___CHARCONV_TRAITS
 #define _LIBCPP___CHARCONV_TRAITS
 
+#include <__assert>
 #include <__bit/countl.h>
 #include <__charconv/tables.h>
 #include <__charconv/to_chars_base_10.h>
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 53ff113a16b2a8..8d4d17378b2973 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -345,87 +345,6 @@ _LIBCPP_HARDENING_MODE_EXTENSIVE, \
 _LIBCPP_HARDENING_MODE_DEBUG
 #  endif
 
-// clang-format off
-// Fast hardening mode checks.
-
-#  if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_FAST
-
-// Enabled checks.
-#    define _LIBCPP_ASSERT_VALID_INPUT_RANGE(expression, message)        _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message)     _LIBCPP_ASSERT(expression, message)
-// Disabled checks.
-// On most modern platforms, dereferencing a null pointer does not lead to an actual memory access.
-#    define _LIBCPP_ASSERT_NON_NULL(expression, message)                 _LIBCPP_ASSUME(expression)
-// Overlapping ranges will make algorithms produce incorrect results but don't directly lead to a security
-// vulnerability.
-#    define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message)   _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_VALID_DEALLOCATION(expression, message)       _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(expression, message)  _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message)     _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message)   _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_PEDANTIC(expression, message)                 _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(expression, message)     _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_INTERNAL(expression, message)                 _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message)            _LIBCPP_ASSUME(expression)
-
-// Extensive hardening mode checks.
-
-#  elif _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_EXTENSIVE
-
-// Enabled checks.
-#    define _LIBCPP_ASSERT_VALID_INPUT_RANGE(expression, message)        _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message)     _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_NON_NULL(expression, message)                 _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message)   _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_VALID_DEALLOCATION(expression, message)       _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(expression, message)  _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message)     _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message)   _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_PEDANTIC(expression, message)                 _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message)            _LIBCPP_ASSERT(expression, message)
-// Disabled checks.
-#    define _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(expression, message)     _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_INTERNAL(expression, message)                 _LIBCPP_ASSUME(expression)
-
-// Debug hardening mode checks.
-
-#  elif _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
-
-// All checks enabled.
-#    define _LIBCPP_ASSERT_VALID_INPUT_RANGE(expression, message)         _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message)      _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_NON_NULL(expression, message)                  _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message)    _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_VALID_DEALLOCATION(expression, message)        _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(expression, message)   _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message)      _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message)    _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_PEDANTIC(expression, message)                  _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(expression, message)      _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_INTERNAL(expression, message)                  _LIBCPP_ASSERT(expression, message)
-#    define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message)             _LIBCPP_ASSERT(expression, message)
-
-// Disable all checks if hardening is not enabled.
-
-#  else
-
-// All checks disabled.
-#    define _LIBCPP_ASSERT_VALID_INPUT_RANGE(expression, message)         _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message)      _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_NON_NULL(expression, message)                  _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message)    _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_VALID_DEALLOCATION(expression, message)        _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(expression, message)   _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message)      _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message)    _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_PEDANTIC(expression, message)                  _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(expression, message)      _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_INTERNAL(expression, message)                  _LIBCPP_ASSUME(expression)
-#    define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message)             _LIBCPP_ASSUME(expression)
-
-#  endif // _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_FAST
-// clang-format on
-
 // } HARDENING
 
 #  define _LIBCPP_TOSTRING2(x) #x
diff --git a/libcxx/include/__format/formatter_floating_point.h b/libcxx/include/__format/formatter_floating_point.h
index 6802a8b7bd4ca3..f01d323efff5fc 100644
--- a/libcxx/include/__format/formatter_floating_point.h
+++ b/libcxx/include/__format/formatter_floating_point.h
@@ -16,6 +16,7 @@
 #include <__algorithm/min.h>
 #include <__algorithm/rotate.h>
 #include <__algorithm/transform.h>
+#include <__assert>
 #include <__charconv/chars_format.h>
 #include <__charconv/to_chars_floating_point.h>
 #include <__charconv/to_chars_result.h>
diff --git a/libcxx/include/__numeric/saturation_arithmetic.h b/libcxx/include/__numeric/saturation_arithmetic.h
index 0e6f455cf22825..41596a0c58e27d 100644
--- a/libcxx/include/__numeric/saturation_arithmetic.h
+++ b/libcxx/include/__numeric/saturation_arithmetic.h
@@ -10,6 +10,7 @@
 #ifndef _LIBCPP___NUMERIC_SATURATION_ARITHMETIC_H
 #define _LIBCPP___NUMERIC_SATURATION_ARITHMETIC_H
 
+#include <__assert>
 #include <__concepts/arithmetic.h>
 #include <__config>
 #include <__utility/cmp.h>
diff --git a/libcxx/include/__random/negative_binomial_distribution.h b/libcxx/include/__random/negative_binomial_distribution.h
index eed4f511e87190..6d0055d01ed432 100644
--- a/libcxx/include/__random/negative_binomial_distribution.h
+++ b/libcxx/include/__random/negative_binomial_distribution.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___RANDOM_NEGATIVE_BINOMIAL_DISTRIBUTION_H
 #define _LIBCPP___RANDOM_NEGATIVE_BINOMIAL_DISTRIBUTION_H
 
+#include <__assert>
 #include <__config>
 #include <__random/bernoulli_distribution.h>
 #include <__random/gamma_distribution.h>
diff --git a/libcxx/include/__ranges/repeat_view.h b/libcxx/include/__ranges/repeat_view.h
index d08f0e0d4e9f74..620a2645497285 100644
--- a/libcxx/include/__ranges/repeat_view.h
+++ b/libcxx/include/__ranges/repeat_view.h
@@ -10,6 +10,7 @@
 #ifndef _LIBCPP___RANGES_REPEAT_VIEW_H
 #define _LIBCPP___RANGES_REPEAT_VIEW_H
 
+#include <__assert>
 #include <__concepts/constructible.h>
 #include <__concepts/same_as.h>
 #include <__concepts/semiregular.h>
diff --git a/libcxx/include/__stop_token/stop_state.h b/libcxx/include/__stop_token/stop_state.h
index 462aa73952b84f..df07573f878628 100644
--- a/libcxx/include/__stop_token/stop_state.h
+++ b/libcxx/include/__stop_token/stop_state.h
@@ -10,6 +10,7 @@
 #ifndef _LIBCPP___STOP_TOKEN_STOP_STATE_H
 #define _LIBCPP___STOP_TOKEN_STOP_STATE_H
 
+#include <__assert>
 #include <__availability>
 #include <__config>
 #include <__stop_token/atomic_unique_lock.h>
diff --git a/libcxx/include/__string/char_traits.h b/libcxx/include/__string/char_traits.h
index 8ea9625d071834..5880d3a22db2e7 100644
--- a/libcxx/include/__string/char_traits.h
+++ b/libcxx/include/__string/char_traits.h
@@ -14,6 +14,7 @@
 #include <__algorithm/find_end.h>
 #include <__algorithm/find_first_of.h>
 #include <__algorithm/min.h>
+#include <__assert>
 #include <__compare/ordering.h>
 #include <__config>
 #include <__functional/hash.h>
diff --git a/libcxx/include/algorithm b/libcxx/include/algorithm
index 70e30bc87e8128..0f62de7fa83f98 100644
--- a/libcxx/include/algorithm
+++ b/libcxx/include/algorithm
@@ -1793,7 +1793,6 @@ template <class BidirectionalIterator, class Compare>
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <version>
 
diff --git a/libcxx/include/any b/libcxx/include/any
index 378dfb6e21b536..ce54803cd91b5b 100644
--- a/libcxx/include/any
+++ b/libcxx/include/any
@@ -80,7 +80,6 @@ namespace std {
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__availability>
 #include <__config>
 #include <__memory/allocator.h>
diff --git a/libcxx/include/array b/libcxx/include/array
index 41f016a4859a32..961b620efb9357 100644
--- a/libcxx/include/array
+++ b/libcxx/include/array
@@ -116,7 +116,7 @@ template <size_t I, class T, size_t N> const T&& get(const array<T, N>&&) noexce
 #include <__algorithm/lexicographical_compare.h>
 #include <__algorithm/lexicographical_compare_three_way.h>
 #include <__algorithm/swap_ranges.h>
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__config>
 #include <__fwd/array.h>
 #include <__iterator/reverse_iterator.h>
diff --git a/libcxx/include/atomic b/libcxx/include/atomic
index 2dac69377b77f0..61ff61d415dd84 100644
--- a/libcxx/include/atomic
+++ b/libcxx/include/atomic
@@ -587,7 +587,6 @@ template <class T>
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__atomic/aliases.h>
 #include <__atomic/atomic.h>
 #include <__atomic/atomic_base.h>
diff --git a/libcxx/include/barrier b/libcxx/include/barrier
index f91452c8d0064c..c5fd84b91925b1 100644
--- a/libcxx/include/barrier
+++ b/libcxx/include/barrier
@@ -51,7 +51,7 @@ namespace std
 #  error "<barrier> is not supported since libc++ has been configured without support for threads."
 #endif
 
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__atomic/atomic_base.h>
 #include <__atomic/memory_order.h>
 #include <__availability>
diff --git a/libcxx/include/bit b/libcxx/include/bit
index 84e2080377e4fa..b8e4bdc2dfe202 100644
--- a/libcxx/include/bit
+++ b/libcxx/include/bit
@@ -61,7 +61,6 @@ namespace std {
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__bit/bit_cast.h>
 #include <__bit/bit_ceil.h>
 #include <__bit/bit_floor.h>
diff --git a/libcxx/include/bitset b/libcxx/include/bitset
index 95f7a63b23179c..8818ab6563b570 100644
--- a/libcxx/include/bitset
+++ b/libcxx/include/bitset
@@ -129,7 +129,6 @@ template <size_t N> struct hash<std::bitset<N>>;
 #include <__algorithm/count.h>
 #include <__algorithm/fill.h>
 #include <__algorithm/find.h>
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__bit_reference>
 #include <__config>
 #include <__functional/hash.h>
diff --git a/libcxx/include/cassert b/libcxx/include/cassert
index 761f57dee1db57..6fec37dc637610 100644
--- a/libcxx/include/cassert
+++ b/libcxx/include/cassert
@@ -16,7 +16,6 @@ Macros:
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 // <assert.h> is not provided by libc++
diff --git a/libcxx/include/ccomplex b/libcxx/include/ccomplex
index cf05c7a9108141..94d2c8d7d003d4 100644
--- a/libcxx/include/ccomplex
+++ b/libcxx/include/ccomplex
@@ -17,7 +17,6 @@
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <complex>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/cctype b/libcxx/include/cctype
index 32be6f38e5f898..d7af7e084aa23a 100644
--- a/libcxx/include/cctype
+++ b/libcxx/include/cctype
@@ -34,7 +34,6 @@ int toupper(int c);
 }  // std
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 #include <ctype.h>
diff --git a/libcxx/include/cerrno b/libcxx/include/cerrno
index 937ec23c6971ad..d488fa72a54b7a 100644
--- a/libcxx/include/cerrno
+++ b/libcxx/include/cerrno
@@ -22,7 +22,6 @@ Macros:
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 #include <errno.h>
diff --git a/libcxx/include/cfenv b/libcxx/include/cfenv
index 16b3761ee27b16..f8cacd562f76bd 100644
--- a/libcxx/include/cfenv
+++ b/libcxx/include/cfenv
@@ -52,7 +52,6 @@ int feupdateenv(const fenv_t* envp);
 }  // std
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 #include <fenv.h>
diff --git a/libcxx/include/cfloat b/libcxx/include/cfloat
index 4f991dd49ff4f8..5d1b38c557dcad 100644
--- a/libcxx/include/cfloat
+++ b/libcxx/include/cfloat
@@ -69,7 +69,6 @@ Macros:
     LDBL_TRUE_MIN       // C11
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 #include <float.h>
diff --git a/libcxx/include/charconv b/libcxx/include/charconv
index 5a2869acba8715..5bc7b9011be024 100644
--- a/libcxx/include/charconv
+++ b/libcxx/include/charconv
@@ -69,7 +69,6 @@ namespace std {
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__charconv/chars_format.h>
 #include <__charconv/from_chars_integral.h>
 #include <__charconv/from_chars_result.h>
diff --git a/libcxx/include/chrono b/libcxx/include/chrono
index fe73f7c772b996..b3b260c2a998e6 100644
--- a/libcxx/include/chrono
+++ b/libcxx/include/chrono
@@ -825,7 +825,6 @@ constexpr chrono::year                                  operator ""y(unsigned lo
 
 // clang-format on
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__chrono/calendar.h>
 #include <__chrono/convert_to_timespec.h>
 #include <__chrono/convert_to_tm.h>
diff --git a/libcxx/include/cinttypes b/libcxx/include/cinttypes
index a5b9558abde12d..52663a4f35fad5 100644
--- a/libcxx/include/cinttypes
+++ b/libcxx/include/cinttypes
@@ -234,7 +234,6 @@ uintmax_t wcstoumax(const wchar_t* restrict nptr, wchar_t** restrict endptr, int
 }  // std
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 // standard-mandated includes
diff --git a/libcxx/include/ciso646 b/libcxx/include/ciso646
index e0cd722495ed0d..1d859f08fac572 100644
--- a/libcxx/include/ciso646
+++ b/libcxx/include/ciso646
@@ -15,7 +15,6 @@
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/climits b/libcxx/include/climits
index 2e8993e4d6a519..bcd8b4a56a073c 100644
--- a/libcxx/include/climits
+++ b/libcxx/include/climits
@@ -37,7 +37,6 @@ Macros:
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 #include <limits.h>
diff --git a/libcxx/include/clocale b/libcxx/include/clocale
index e2ace355d7b648..c689a64be288a3 100644
--- a/libcxx/include/clocale
+++ b/libcxx/include/clocale
@@ -34,7 +34,6 @@ lconv* localeconv();
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 #include <locale.h>
diff --git a/libcxx/include/cmath b/libcxx/include/cmath
index 798ddb4963b0ec..dd194bbb558969 100644
--- a/libcxx/include/cmath
+++ b/libcxx/include/cmath
@@ -304,7 +304,6 @@ constexpr long double lerp(long double a, long double b, long double t) noexcept
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/is_arithmetic.h>
diff --git a/libcxx/include/codecvt b/libcxx/include/codecvt
index 504dd71f300405..b7182ff471559d 100644
--- a/libcxx/include/codecvt
+++ b/libcxx/include/codecvt
@@ -54,7 +54,6 @@ class codecvt_utf8_utf16
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__locale>
 #include <version>
diff --git a/libcxx/include/compare b/libcxx/include/compare
index cc0cae8a544d62..93953254b78436 100644
--- a/libcxx/include/compare
+++ b/libcxx/include/compare
@@ -140,7 +140,6 @@ namespace std {
 }
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__compare/common_comparison_category.h>
 #include <__compare/compare_partial_order_fallback.h>
 #include <__compare/compare_strong_order_fallback.h>
diff --git a/libcxx/include/complex b/libcxx/include/complex
index 0aba60e514ba22..e996485a38ae67 100644
--- a/libcxx/include/complex
+++ b/libcxx/include/complex
@@ -256,7 +256,6 @@ template<class T> complex<T> tanh (const complex<T>&);
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__fwd/complex.h>
 #include <__tuple/tuple_element.h>
diff --git a/libcxx/include/concepts b/libcxx/include/concepts
index 5fdf30ecfbd3fb..e10f5ab5ad8a18 100644
--- a/libcxx/include/concepts
+++ b/libcxx/include/concepts
@@ -129,7 +129,6 @@ namespace std {
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__concepts/arithmetic.h>
 #include <__concepts/assignable.h>
 #include <__concepts/boolean_testable.h>
diff --git a/libcxx/include/condition_variable b/libcxx/include/condition_variable
index 6aac3c13ef4a74..4ded1140d46b1b 100644
--- a/libcxx/include/condition_variable
+++ b/libcxx/include/condition_variable
@@ -118,7 +118,6 @@ public:
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__availability>
 #include <__chrono/duration.h>
 #include <__chrono/steady_clock.h>
diff --git a/libcxx/include/coroutine b/libcxx/include/coroutine
index f264570128bb80..4bd1d4e9c3103a 100644
--- a/libcxx/include/coroutine
+++ b/libcxx/include/coroutine
@@ -38,7 +38,6 @@ struct suspend_always;
 
  */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__coroutine/coroutine_handle.h>
 #include <__coroutine/coroutine_traits.h>
diff --git a/libcxx/include/csetjmp b/libcxx/include/csetjmp
index 9012cad22ebe74..7ba90068710aea 100644
--- a/libcxx/include/csetjmp
+++ b/libcxx/include/csetjmp
@@ -30,7 +30,6 @@ void longjmp(jmp_buf env, int val);
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 // <setjmp.h> is not provided by libc++
diff --git a/libcxx/include/csignal b/libcxx/include/csignal
index cf45f507535e1d..804a7f95ae9682 100644
--- a/libcxx/include/csignal
+++ b/libcxx/include/csignal
@@ -39,7 +39,6 @@ int raise(int sig);
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 // <signal.h> is not provided by libc++
diff --git a/libcxx/include/cstdarg b/libcxx/include/cstdarg
index 3a4291f4584aa1..4642eb7b5258ca 100644
--- a/libcxx/include/cstdarg
+++ b/libcxx/include/cstdarg
@@ -31,7 +31,6 @@ Types:
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 // <stdarg.h> is not provided by libc++
diff --git a/libcxx/include/cstdbool b/libcxx/include/cstdbool
index ce608033a22ce1..ef731c021a4ab8 100644
--- a/libcxx/include/cstdbool
+++ b/libcxx/include/cstdbool
@@ -19,7 +19,6 @@ Macros:
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/cstddef b/libcxx/include/cstddef
index 1d7bac24c81eaa..ed16ae44fb2bf8 100644
--- a/libcxx/include/cstddef
+++ b/libcxx/include/cstddef
@@ -33,7 +33,6 @@ Types:
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/integral_constant.h>
diff --git a/libcxx/include/cstdint b/libcxx/include/cstdint
index 829d9398f387a8..8c4782859426dd 100644
--- a/libcxx/include/cstdint
+++ b/libcxx/include/cstdint
@@ -140,7 +140,6 @@ Types:
 }  // std
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 #include <stdint.h>
diff --git a/libcxx/include/cstdio b/libcxx/include/cstdio
index 0a867cec1a388b..7f94371081f8b1 100644
--- a/libcxx/include/cstdio
+++ b/libcxx/include/cstdio
@@ -95,7 +95,6 @@ void perror(const char* s);
 }  // std
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 #include <stdio.h>
diff --git a/libcxx/include/cstdlib b/libcxx/include/cstdlib
index 9bf0ea3f73b169..c817fd8f4accda 100644
--- a/libcxx/include/cstdlib
+++ b/libcxx/include/cstdlib
@@ -81,7 +81,6 @@ void *aligned_alloc(size_t alignment, size_t size);                       // C11
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 #include <stdlib.h>
diff --git a/libcxx/include/cstring b/libcxx/include/cstring
index a9bdf4ff2dfca7..c2c92b02e73cc1 100644
--- a/libcxx/include/cstring
+++ b/libcxx/include/cstring
@@ -56,7 +56,6 @@ size_t strlen(const char* s);
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__type_traits/is_constant_evaluated.h>
 
diff --git a/libcxx/include/ctgmath b/libcxx/include/ctgmath
index bfcf2f98d470c8..6237979be4906c 100644
--- a/libcxx/include/ctgmath
+++ b/libcxx/include/ctgmath
@@ -18,7 +18,6 @@
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <ccomplex>
 #include <cmath>
 
diff --git a/libcxx/include/ctime b/libcxx/include/ctime
index b61e19d6446ddc..f47b49a43e23ef 100644
--- a/libcxx/include/ctime
+++ b/libcxx/include/ctime
@@ -45,7 +45,6 @@ int timespec_get( struct timespec *ts, int base); // C++17
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 // <time.h> is not provided by libc++
diff --git a/libcxx/include/cuchar b/libcxx/include/cuchar
index 03b8c7d2a88bcf..f0015be275367d 100644
--- a/libcxx/include/cuchar
+++ b/libcxx/include/cuchar
@@ -36,7 +36,6 @@ size_t c32rtomb(char* s, char32_t c32, mbstate_t* ps);
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 
 #include <uchar.h>
diff --git a/libcxx/include/cwchar b/libcxx/include/cwchar
index 122af242880e04..7442438d8f447f 100644
--- a/libcxx/include/cwchar
+++ b/libcxx/include/cwchar
@@ -102,7 +102,6 @@ size_t wcsrtombs(char* restrict dst, const wchar_t** restrict src, size_t len,
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__type_traits/apply_cv.h>
 #include <__type_traits/is_constant_evaluated.h>
diff --git a/libcxx/include/cwctype b/libcxx/include/cwctype
index 5a2d2427d8471f..04abfabef57933 100644
--- a/libcxx/include/cwctype
+++ b/libcxx/include/cwctype
@@ -49,7 +49,6 @@ wctrans_t wctrans(const char* property);
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <cctype>
 
diff --git a/libcxx/include/deque b/libcxx/include/deque
index c539a06bdd95c0..85ea9c6f661ed6 100644
--- a/libcxx/include/deque
+++ b/libcxx/include/deque
@@ -188,7 +188,7 @@ template <class T, class Allocator, class Predicate>
 #include <__algorithm/remove.h>
 #include <__algorithm/remove_if.h>
 #include <__algorithm/unwrap_iter.h>
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__availability>
 #include <__config>
 #include <__format/enable_insertable.h>
diff --git a/libcxx/include/exception b/libcxx/include/exception
index 97fee977690d0a..5eff8e3f8a4bfa 100644
--- a/libcxx/include/exception
+++ b/libcxx/include/exception
@@ -76,7 +76,6 @@ template <class E> void rethrow_if_nested(const E& e);
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__exception/exception.h>
 #include <__exception/exception_ptr.h>
diff --git a/libcxx/include/execution b/libcxx/include/execution
index 56facc87379ef1..822ffa1fd3ebc4 100644
--- a/libcxx/include/execution
+++ b/libcxx/include/execution
@@ -32,7 +32,6 @@ namespace std {
 }
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/is_same.h>
diff --git a/libcxx/include/expected b/libcxx/include/expected
index 44d0ce6b00c81e..f455ab7d5d61c6 100644
--- a/libcxx/include/expected
+++ b/libcxx/include/expected
@@ -38,7 +38,6 @@ namespace std {
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__expected/bad_expected_access.h>
 #include <__expected/expected.h>
diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h
index 717fd6cd92d710..aff2cd11cfcfaa 100644
--- a/libcxx/include/experimental/__simd/scalar.h
+++ b/libcxx/include/experimental/__simd/scalar.h
@@ -10,6 +10,7 @@
 #ifndef _LIBCPP_EXPERIMENTAL___SIMD_SCALAR_H
 #define _LIBCPP_EXPERIMENTAL___SIMD_SCALAR_H
 
+#include <__assert>
 #include <cstddef>
 #include <experimental/__config>
 #include <experimental/__simd/declaration.h>
diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h
index 7883132ba6c0db..c9423df93cfacc 100644
--- a/libcxx/include/experimental/__simd/vec_ext.h
+++ b/libcxx/include/experimental/__simd/vec_ext.h
@@ -10,6 +10,7 @@
 #ifndef _LIBCPP_EXPERIMENTAL___SIMD_VEC_EXT_H
 #define _LIBCPP_EXPERIMENTAL___SIMD_VEC_EXT_H
 
+#include <__assert>
 #include <__bit/bit_ceil.h>
 #include <__utility/forward.h>
 #include <__utility/integer_sequence.h>
diff --git a/libcxx/include/experimental/iterator b/libcxx/include/experimental/iterator
index e9c1fb6924eced..de82da2d3d72bd 100644
--- a/libcxx/include/experimental/iterator
+++ b/libcxx/include/experimental/iterator
@@ -52,7 +52,6 @@ namespace std {
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__memory/addressof.h>
 #include <__type_traits/decay.h>
 #include <__utility/forward.h>
diff --git a/libcxx/include/experimental/propagate_const b/libcxx/include/experimental/propagate_const
index 06d7ba43daf1ca..8c2ceb9def3357 100644
--- a/libcxx/include/experimental/propagate_const
+++ b/libcxx/include/experimental/propagate_const
@@ -107,7 +107,6 @@
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__functional/operations.h>
 #include <__fwd/hash.h>
 #include <__type_traits/conditional.h>
diff --git a/libcxx/include/experimental/simd b/libcxx/include/experimental/simd
index adca9faa47bb06..fad6431d13a193 100644
--- a/libcxx/include/experimental/simd
+++ b/libcxx/include/experimental/simd
@@ -71,8 +71,6 @@ inline namespace parallelism_v2 {
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
-
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
diff --git a/libcxx/include/experimental/type_traits b/libcxx/include/experimental/type_traits
index 62f9574ec58f4f..37be434f8edd56 100644
--- a/libcxx/include/experimental/type_traits
+++ b/libcxx/include/experimental/type_traits
@@ -68,7 +68,6 @@ inline namespace fundamentals_v1 {
 
  */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <experimental/__config>
 
 #if _LIBCPP_STD_VER >= 14
diff --git a/libcxx/include/experimental/utility b/libcxx/include/experimental/utility
index c1bd9364fd51e4..8bd0a055b7783f 100644
--- a/libcxx/include/experimental/utility
+++ b/libcxx/include/experimental/utility
@@ -30,7 +30,6 @@ inline namespace fundamentals_v1 {
 
  */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <experimental/__config>
 #include <utility>
 
diff --git a/libcxx/include/ext/hash_map b/libcxx/include/ext/hash_map
index 7ac268d5dcbdec..7b5b31c4081788 100644
--- a/libcxx/include/ext/hash_map
+++ b/libcxx/include/ext/hash_map
@@ -201,7 +201,6 @@ template <class Key, class T, class Hash, class Pred, class Alloc>
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__hash_table>
 #include <algorithm>
diff --git a/libcxx/include/ext/hash_set b/libcxx/include/ext/hash_set
index 79f0925f6f4c67..1ab259b59979f3 100644
--- a/libcxx/include/ext/hash_set
+++ b/libcxx/include/ext/hash_set
@@ -192,7 +192,6 @@ template <class Value, class Hash, class Pred, class Alloc>
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__hash_table>
 #include <algorithm>
diff --git a/libcxx/include/filesystem b/libcxx/include/filesystem
index ec68354a9fc933..b344ed468082e8 100644
--- a/libcxx/include/filesystem
+++ b/libcxx/include/filesystem
@@ -533,7 +533,6 @@ inline constexpr bool std::ranges::enable_view<std::filesystem::recursive_direct
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__filesystem/copy_options.h>
 #include <__filesystem/directory_entry.h>
diff --git a/libcxx/include/format b/libcxx/include/format
index 64f6ba1d25284a..b2fe0053b974bb 100644
--- a/libcxx/include/format
+++ b/libcxx/include/format
@@ -188,7 +188,6 @@ namespace std {
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__format/buffer.h>
 #include <__format/concepts.h>
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index ffa390f42a1072..a62b171a46783b 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -199,7 +199,6 @@ template <class T, class Allocator, class Predicate>
 #include <__algorithm/lexicographical_compare.h>
 #include <__algorithm/lexicographical_compare_three_way.h>
 #include <__algorithm/min.h>
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__availability>
 #include <__config>
 #include <__iterator/distance.h>
diff --git a/libcxx/include/fstream b/libcxx/include/fstream
index 203cc6dfb4b134..513c8dc2b127a4 100644
--- a/libcxx/include/fstream
+++ b/libcxx/include/fstream
@@ -187,7 +187,7 @@ typedef basic_fstream<wchar_t> wfstream;
 */
 
 #include <__algorithm/max.h>
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__availability>
 #include <__config>
 #include <__fwd/fstream.h>
diff --git a/libcxx/include/functional b/libcxx/include/functional
index fd99e11fb18180..a2774a48bda0ee 100644
--- a/libcxx/include/functional
+++ b/libcxx/include/functional
@@ -513,7 +513,6 @@ POLICY:  For non-variadic implementations, the number of arguments is limited
 */
 
 #include <__algorithm/search.h>
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__compare/compare_three_way.h>
 #include <__config>
 #include <__functional/binary_function.h>
diff --git a/libcxx/include/future b/libcxx/include/future
index 13828680f03335..fda1591818a667 100644
--- a/libcxx/include/future
+++ b/libcxx/include/future
@@ -368,7 +368,7 @@ template <class R, class Alloc> struct uses_allocator<packaged_task<R>, Alloc>;
 #  error "<future> is not supported since libc++ has been configured without support for threads."
 #endif
 
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__availability>
 #include <__chrono/duration.h>
 #include <__chrono/time_point.h>
diff --git a/libcxx/include/initializer_list b/libcxx/include/initializer_list
index 4c2a7925a57bbf..680ca1cd20d550 100644
--- a/libcxx/include/initializer_list
+++ b/libcxx/include/initializer_list
@@ -42,7 +42,6 @@ template<class E> const E* end(initializer_list<E> il) noexcept; // constexpr in
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <cstddef>
 
diff --git a/libcxx/include/iomanip b/libcxx/include/iomanip
index 867408affd22b6..fb4f15b9a58533 100644
--- a/libcxx/include/iomanip
+++ b/libcxx/include/iomanip
@@ -42,7 +42,6 @@ template <class charT, class traits, class Allocator>
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <istream>
 #include <version>
diff --git a/libcxx/include/ios b/libcxx/include/ios
index 8465860d08dc14..4b1306fc2ad830 100644
--- a/libcxx/include/ios
+++ b/libcxx/include/ios
@@ -217,7 +217,6 @@ storage-class-specifier const error_category& iostream_category() noexcept;
 #  error "The iostreams library is not supported since libc++ has been configured without support for localization."
 #endif
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__fwd/ios.h>
 #include <__ios/fpos.h>
 #include <__locale>
diff --git a/libcxx/include/iosfwd b/libcxx/include/iosfwd
index e28998d004156d..1579fa12754daf 100644
--- a/libcxx/include/iosfwd
+++ b/libcxx/include/iosfwd
@@ -106,7 +106,6 @@ using wosyncstream = basic_osyncstream<wchar_t>;  // C++20
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__fwd/fstream.h>
 #include <__fwd/ios.h>
diff --git a/libcxx/include/iostream b/libcxx/include/iostream
index 568ce8caed6ef1..5df45c6d3f78e7 100644
--- a/libcxx/include/iostream
+++ b/libcxx/include/iostream
@@ -33,7 +33,6 @@ extern wostream wclog;
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <version>
 
diff --git a/libcxx/include/istream b/libcxx/include/istream
index 7975a9e599a5b6..3f20c355046cea 100644
--- a/libcxx/include/istream
+++ b/libcxx/include/istream
@@ -158,7 +158,6 @@ template <class Stream, class T>
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__fwd/istream.h>
 #include <__iterator/istreambuf_iterator.h>
diff --git a/libcxx/include/iterator b/libcxx/include/iterator
index 2f9280742370a2..5779bf828711b8 100644
--- a/libcxx/include/iterator
+++ b/libcxx/include/iterator
@@ -674,7 +674,6 @@ template <class E> constexpr const E* data(initializer_list<E> il) noexcept;
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__iterator/access.h>
 #include <__iterator/advance.h>
diff --git a/libcxx/include/latch b/libcxx/include/latch
index dd389d296f5c11..3fe201b63d1385 100644
--- a/libcxx/include/latch
+++ b/libcxx/include/latch
@@ -46,7 +46,7 @@ namespace std
 #  error "<latch> is not supported since libc++ has been configured without support for threads."
 #endif
 
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__atomic/atomic_base.h>
 #include <__atomic/atomic_sync.h>
 #include <__atomic/memory_order.h>
diff --git a/libcxx/include/limits b/libcxx/include/limits
index c704b4dddaf8e2..f15b5b1ab1d52f 100644
--- a/libcxx/include/limits
+++ b/libcxx/include/limits
@@ -102,7 +102,6 @@ template<> class numeric_limits<cv long double>;
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_signed.h>
diff --git a/libcxx/include/list b/libcxx/include/list
index 2705d4c9914d80..8f0689268e2a5a 100644
--- a/libcxx/include/list
+++ b/libcxx/include/list
@@ -202,7 +202,7 @@ template <class T, class Allocator, class Predicate>
 #include <__algorithm/lexicographical_compare.h>
 #include <__algorithm/lexicographical_compare_three_way.h>
 #include <__algorithm/min.h>
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__availability>
 #include <__config>
 #include <__format/enable_insertable.h>
diff --git a/libcxx/include/locale b/libcxx/include/locale
index 9e97eb9f339533..e3c63e3abe130e 100644
--- a/libcxx/include/locale
+++ b/libcxx/include/locale
@@ -193,7 +193,7 @@ template <class charT> class messages_byname;
 #include <__algorithm/max.h>
 #include <__algorithm/reverse.h>
 #include <__algorithm/unwrap_iter.h>
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__config>
 #include <__iterator/access.h>
 #include <__iterator/back_insert_iterator.h>
diff --git a/libcxx/include/map b/libcxx/include/map
index a56584589f5c85..5b6ec9d3a21936 100644
--- a/libcxx/include/map
+++ b/libcxx/include/map
@@ -574,7 +574,7 @@ erase_if(multimap<Key, T, Compare, Allocator>& c, Predicate pred);  // C++20
 #include <__algorithm/equal.h>
 #include <__algorithm/lexicographical_compare.h>
 #include <__algorithm/lexicographical_compare_three_way.h>
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__availability>
 #include <__config>
 #include <__functional/binary_function.h>
diff --git a/libcxx/include/memory b/libcxx/include/memory
index 0ada7cdfa20690..a8c0264eb9eb78 100644
--- a/libcxx/include/memory
+++ b/libcxx/include/memory
@@ -917,7 +917,6 @@ template<size_t N, class T>
 
 // clang-format on
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__memory/addressof.h>
 #include <__memory/align.h>
diff --git a/libcxx/include/mutex b/libcxx/include/mutex
index e67135fc0ec04e..ea56e3051908a7 100644
--- a/libcxx/include/mutex
+++ b/libcxx/include/mutex
@@ -186,7 +186,6 @@ template<class Callable, class ...Args>
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__chrono/steady_clock.h>
 #include <__chrono/time_point.h>
 #include <__condition_variable/condition_variable.h>
diff --git a/libcxx/include/new b/libcxx/include/new
index 86fbcb524b66d8..988f7a84422c84 100644
--- a/libcxx/include/new
+++ b/libcxx/include/new
@@ -86,13 +86,13 @@ void  operator delete[](void* ptr, void*) noexcept;
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__availability>
 #include <__config>
 #include <__exception/exception.h>
 #include <__type_traits/is_function.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/remove_cv.h>
+#include <__verbose_abort>
 #include <cstddef>
 #include <version>
 
diff --git a/libcxx/include/numbers b/libcxx/include/numbers
index 0d834c6b863f66..f48ba4baf38ffd 100644
--- a/libcxx/include/numbers
+++ b/libcxx/include/numbers
@@ -58,7 +58,6 @@ namespace std::numbers {
 }
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__concepts/arithmetic.h>
 #include <__config>
 #include <version>
diff --git a/libcxx/include/numeric b/libcxx/include/numeric
index 0fe7115f1c666e..8b429fa2f7e7d5 100644
--- a/libcxx/include/numeric
+++ b/libcxx/include/numeric
@@ -156,7 +156,6 @@ constexpr T saturate_cast(U x) noexcept;                    // freestanding, Sin
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <version>
 
diff --git a/libcxx/include/optional b/libcxx/include/optional
index 73da0a8a5a7c19..9e4f0fff2f4a7a 100644
--- a/libcxx/include/optional
+++ b/libcxx/include/optional
@@ -177,7 +177,7 @@ namespace std {
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__availability>
 #include <__compare/compare_three_way_result.h>
 #include <__compare/three_way_comparable.h>
diff --git a/libcxx/include/ostream b/libcxx/include/ostream
index 2e2607340a5de1..42819ceb252c65 100644
--- a/libcxx/include/ostream
+++ b/libcxx/include/ostream
@@ -171,7 +171,6 @@ void vprint_nonunicode(ostream& os, string_view fmt, format_args args);
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__availability>
 #include <__config>
 #include <__exception/operations.h>
diff --git a/libcxx/include/print b/libcxx/include/print
index 543a540ee4f27d..a9f10433a7dc61 100644
--- a/libcxx/include/print
+++ b/libcxx/include/print
@@ -31,7 +31,7 @@ namespace std {
 }
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__availability>
 #include <__concepts/same_as.h>
 #include <__config>
diff --git a/libcxx/include/queue b/libcxx/include/queue
index 2263f71fde9073..521a465713cd22 100644
--- a/libcxx/include/queue
+++ b/libcxx/include/queue
@@ -258,7 +258,6 @@ template <class T, class Container, class Compare>
 #include <__algorithm/pop_heap.h>
 #include <__algorithm/push_heap.h>
 #include <__algorithm/ranges_copy.h>
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__functional/operations.h>
 #include <__iterator/back_insert_iterator.h>
diff --git a/libcxx/include/random b/libcxx/include/random
index 02d71ad6dd25c8..9edd6c4608ec26 100644
--- a/libcxx/include/random
+++ b/libcxx/include/random
@@ -1677,7 +1677,6 @@ class piecewise_linear_distribution
 } // std
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__random/bernoulli_distribution.h>
 #include <__random/binomial_distribution.h>
diff --git a/libcxx/include/ranges b/libcxx/include/ranges
index 660d533b2a7830..167d2137eaf454 100644
--- a/libcxx/include/ranges
+++ b/libcxx/include/ranges
@@ -375,7 +375,6 @@ namespace std {
 }
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__ranges/access.h>
 #include <__ranges/all.h>
diff --git a/libcxx/include/ratio b/libcxx/include/ratio
index de656f38e01de6..b989c272aaee6a 100644
--- a/libcxx/include/ratio
+++ b/libcxx/include/ratio
@@ -81,7 +81,6 @@ using quetta = ratio <1'000'000'000'000'000'000'000'000'000'000, 1>; // Since C+
 }
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__type_traits/integral_constant.h>
 #include <climits>
diff --git a/libcxx/include/regex b/libcxx/include/regex
index 48af5b8b57fd64..dc3db93744b489 100644
--- a/libcxx/include/regex
+++ b/libcxx/include/regex
@@ -791,7 +791,7 @@ typedef regex_token_iterator<wstring::const_iterator> wsregex_token_iterator;
 
 #include <__algorithm/find.h>
 #include <__algorithm/search.h>
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__availability>
 #include <__config>
 #include <__iterator/back_insert_iterator.h>
diff --git a/libcxx/include/scoped_allocator b/libcxx/include/scoped_allocator
index fa6c6c5d20d864..c53261025be9d7 100644
--- a/libcxx/include/scoped_allocator
+++ b/libcxx/include/scoped_allocator
@@ -109,7 +109,6 @@ template <class OuterA1, class OuterA2, class... InnerAllocs>
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__memory/allocator_traits.h>
 #include <__memory/uses_allocator_construction.h>
diff --git a/libcxx/include/semaphore b/libcxx/include/semaphore
index 448b5fbd8c58cf..2dfdae9aa148c1 100644
--- a/libcxx/include/semaphore
+++ b/libcxx/include/semaphore
@@ -51,7 +51,7 @@ using binary_semaphore = counting_semaphore<1>;
 #  error "<semaphore> is not supported since libc++ has been configured without support for threads."
 #endif
 
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__atomic/atomic_base.h>
 #include <__atomic/atomic_sync.h>
 #include <__atomic/memory_order.h>
diff --git a/libcxx/include/set b/libcxx/include/set
index 7f8245f8b605ab..e2e87e4cdcfe3b 100644
--- a/libcxx/include/set
+++ b/libcxx/include/set
@@ -515,7 +515,7 @@ erase_if(multiset<Key, Compare, Allocator>& c, Predicate pred);  // C++20
 #include <__algorithm/equal.h>
 #include <__algorithm/lexicographical_compare.h>
 #include <__algorithm/lexicographical_compare_three_way.h>
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__availability>
 #include <__config>
 #include <__functional/is_transparent.h>
diff --git a/libcxx/include/shared_mutex b/libcxx/include/shared_mutex
index 57f385b5435eb2..38b559e8930fc5 100644
--- a/libcxx/include/shared_mutex
+++ b/libcxx/include/shared_mutex
@@ -128,7 +128,6 @@ template <class Mutex>
 #  error "<shared_mutex> is not supported since libc++ has been configured without support for threads."
 #endif
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__availability>
 #include <__chrono/duration.h>
 #include <__chrono/steady_clock.h>
diff --git a/libcxx/include/span b/libcxx/include/span
index 32364b4270be9e..9efaac517fc8f6 100644
--- a/libcxx/include/span
+++ b/libcxx/include/span
@@ -128,7 +128,7 @@ template<class R>
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__config>
 #include <__fwd/span.h>
 #include <__iterator/bounded_iter.h>
diff --git a/libcxx/include/sstream b/libcxx/include/sstream
index 8862e2ef99f8da..60bec52209d75e 100644
--- a/libcxx/include/sstream
+++ b/libcxx/include/sstream
@@ -278,7 +278,6 @@ typedef basic_stringstream<wchar_t> wstringstream;
 
 // clang-format on
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__availability>
 #include <__config>
 #include <__fwd/sstream.h>
diff --git a/libcxx/include/stack b/libcxx/include/stack
index 77f1a4e11b732d..4003792600a004 100644
--- a/libcxx/include/stack
+++ b/libcxx/include/stack
@@ -114,7 +114,6 @@ template <class T, class Container>
 */
 
 #include <__algorithm/ranges_copy.h>
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__iterator/back_insert_iterator.h>
 #include <__iterator/iterator_traits.h>
diff --git a/libcxx/include/stdexcept b/libcxx/include/stdexcept
index 3016c130a91b8f..4e4cd22a6a64d2 100644
--- a/libcxx/include/stdexcept
+++ b/libcxx/include/stdexcept
@@ -41,7 +41,6 @@ public:
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__exception/exception.h>
 #include <__fwd/string.h>
diff --git a/libcxx/include/stop_token b/libcxx/include/stop_token
index 66c7a6ab5996c1..fee195f9d63d4b 100644
--- a/libcxx/include/stop_token
+++ b/libcxx/include/stop_token
@@ -37,7 +37,6 @@ namespace std {
 #  error "<stop_token> is not supported since libc++ has been configured without support for threads."
 #endif
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__stop_token/stop_callback.h>
 #include <__stop_token/stop_source.h>
 #include <__stop_token/stop_token.h>
diff --git a/libcxx/include/streambuf b/libcxx/include/streambuf
index aad7686a435cbc..aec537866c2031 100644
--- a/libcxx/include/streambuf
+++ b/libcxx/include/streambuf
@@ -107,7 +107,6 @@ protected:
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__fwd/streambuf.h>
 #include <__type_traits/is_same.h>
diff --git a/libcxx/include/string b/libcxx/include/string
index 530a2233860434..ca5b3fa6a01472 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -572,7 +572,7 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len );
 #include <__algorithm/min.h>
 #include <__algorithm/remove.h>
 #include <__algorithm/remove_if.h>
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__config>
 #include <__format/enable_insertable.h>
 #include <__functional/hash.h>
diff --git a/libcxx/include/string_view b/libcxx/include/string_view
index e414507a7933b6..48bbcd80021670 100644
--- a/libcxx/include/string_view
+++ b/libcxx/include/string_view
@@ -206,7 +206,7 @@ namespace std {
 // clang-format on
 
 #include <__algorithm/min.h>
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__config>
 #include <__functional/hash.h>
 #include <__functional/unary_function.h>
diff --git a/libcxx/include/strstream b/libcxx/include/strstream
index e20c86baa6dfc5..e9f533644f78cf 100644
--- a/libcxx/include/strstream
+++ b/libcxx/include/strstream
@@ -129,7 +129,6 @@ private:
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <istream>
 #include <ostream>
diff --git a/libcxx/include/system_error b/libcxx/include/system_error
index a60c98492aaced..eeab347788a9a5 100644
--- a/libcxx/include/system_error
+++ b/libcxx/include/system_error
@@ -144,7 +144,6 @@ template <> struct hash<std::error_condition>;
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__system_error/errc.h>
 #include <__system_error/error_category.h>
diff --git a/libcxx/include/thread b/libcxx/include/thread
index 29c7e86785cde4..ed70bde76094ae 100644
--- a/libcxx/include/thread
+++ b/libcxx/include/thread
@@ -92,7 +92,6 @@ void sleep_for(const chrono::duration<Rep, Period>& rel_time);
 #  error "<thread> is not supported since libc++ has been configured without support for threads."
 #endif
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__availability>
 #include <__thread/formatter.h>
 #include <__thread/jthread.h>
diff --git a/libcxx/include/tuple b/libcxx/include/tuple
index 96cf3be85b76f2..0101d64aea4a72 100644
--- a/libcxx/include/tuple
+++ b/libcxx/include/tuple
@@ -205,7 +205,6 @@ template <class... Types>
 
 // clang-format on
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__compare/common_comparison_category.h>
 #include <__compare/synth_three_way.h>
 #include <__config>
diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits
index 466aeb6e0ddd71..0037c426560e6f 100644
--- a/libcxx/include/type_traits
+++ b/libcxx/include/type_traits
@@ -416,7 +416,7 @@ namespace std
 }
 
 */
-#include <__assert> // all public C++ headers provide the assertion handler
+
 #include <__config>
 #include <__fwd/hash.h> // This is https://llvm.org/PR56938
 #include <__type_traits/add_const.h>
diff --git a/libcxx/include/typeindex b/libcxx/include/typeindex
index e6ea12afd52450..6398aa40d616a7 100644
--- a/libcxx/include/typeindex
+++ b/libcxx/include/typeindex
@@ -45,7 +45,6 @@ struct hash<type_index>
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__functional/unary_function.h>
 #include <typeinfo>
diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo
index 1144b5b12913e1..dafc7b89248eca 100644
--- a/libcxx/include/typeinfo
+++ b/libcxx/include/typeinfo
@@ -56,7 +56,6 @@ public:
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__availability>
 #include <__config>
 #include <__exception/exception.h>
diff --git a/libcxx/include/unordered_map b/libcxx/include/unordered_map
index d2a3b769821d84..ca3d1a80bd578d 100644
--- a/libcxx/include/unordered_map
+++ b/libcxx/include/unordered_map
@@ -584,7 +584,7 @@ template <class Key, class T, class Hash, class Pred, class Alloc>
 */
 
 #include <__algorithm/is_permutation.h>
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__availability>
 #include <__config>
 #include <__functional/is_transparent.h>
diff --git a/libcxx/include/unordered_set b/libcxx/include/unordered_set
index 50b616907f0052..64a02de3cf55d4 100644
--- a/libcxx/include/unordered_set
+++ b/libcxx/include/unordered_set
@@ -532,7 +532,7 @@ template <class Value, class Hash, class Pred, class Alloc>
 // clang-format on
 
 #include <__algorithm/is_permutation.h>
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__availability>
 #include <__config>
 #include <__functional/is_transparent.h>
diff --git a/libcxx/include/utility b/libcxx/include/utility
index 1deef3db204107..90713da621c5da 100644
--- a/libcxx/include/utility
+++ b/libcxx/include/utility
@@ -246,7 +246,6 @@ template <class T>
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__config>
 #include <__utility/as_const.h>
 #include <__utility/as_lvalue.h>
diff --git a/libcxx/include/valarray b/libcxx/include/valarray
index 88b161eccd332f..3d45925a25bef8 100644
--- a/libcxx/include/valarray
+++ b/libcxx/include/valarray
@@ -350,7 +350,7 @@ template <class T> unspecified2 end(const valarray<T>& v);
 #include <__algorithm/min.h>
 #include <__algorithm/min_element.h>
 #include <__algorithm/unwrap_iter.h>
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__config>
 #include <__functional/operations.h>
 #include <__memory/addressof.h>
diff --git a/libcxx/include/variant b/libcxx/include/variant
index 6063739e52c86b..5ce99250a8b4f4 100644
--- a/libcxx/include/variant
+++ b/libcxx/include/variant
@@ -210,7 +210,6 @@ namespace std {
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__availability>
 #include <__compare/common_comparison_category.h>
 #include <__compare/compare_three_way_result.h>
diff --git a/libcxx/include/vector b/libcxx/include/vector
index 579fadfb404c19..89cbdf0b3ff747 100644
--- a/libcxx/include/vector
+++ b/libcxx/include/vector
@@ -315,7 +315,7 @@ template<class T, class charT> requires is-vector-bool-reference<T> // Since C++
 #include <__algorithm/remove_if.h>
 #include <__algorithm/rotate.h>
 #include <__algorithm/unwrap_iter.h>
-#include <__assert> // all public C++ headers provide the assertion handler
+#include <__assert>
 #include <__availability>
 #include <__bit_reference>
 #include <__concepts/same_as.h>
diff --git a/libcxx/include/version b/libcxx/include/version
index b18927a2bc38c2..cd180441c5b9e1 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -244,7 +244,6 @@ __cpp_lib_within_lifetime                               202306L <type_traits>
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__availability>
 #include <__config>
 
diff --git a/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp b/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp
index 585ab73f2cb261..9298a1e365fca4 100644
--- a/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp
+++ b/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp
@@ -12,6 +12,7 @@
 // failures when back-deploying.
 // XFAIL: availability-verbose_abort-missing
 
+#include <__verbose_abort>
 #include <cstdlib>
 
 void std::__libcpp_verbose_abort(char const*, ...) {
diff --git a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py
deleted file mode 100644
index bd883aa0c14502..00000000000000
--- a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py
+++ /dev/null
@@ -1,33 +0,0 @@
-#===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-#===----------------------------------------------------------------------===##
-
-# Test that all public C++ headers define the verbose termination function, which
-# is required for users to be able to include any public header and then override
-# the function using a strong definition.
-
-# RUN: %{python} %s %{libcxx-dir}/utils
-
-import sys
-sys.path.append(sys.argv[1])
-from libcxx.header_information import lit_header_restrictions, public_headers
-
-for header in public_headers:
-    # Skip C compatibility headers.
-    if header.endswith('.h'):
-        continue
-
-    BLOCKLIT = '' # block Lit from interpreting a RUN/XFAIL/etc inside the generation script
-    print(f"""\
-//--- {header}.compile.pass.cpp
-{lit_header_restrictions.get(header, '')}
-
-// XFAIL{BLOCKLIT}: availability-verbose_abort-missing
-
-#include <{header}>
-using HandlerType = decltype(std::__libcpp_verbose_abort);
-""")
diff --git a/libcxx/test/libcxx/assertions/modes/none.pass.cpp b/libcxx/test/libcxx/assertions/modes/none.pass.cpp
index 4644c5692e70be..8332848c1a8e03 100644
--- a/libcxx/test/libcxx/assertions/modes/none.pass.cpp
+++ b/libcxx/test/libcxx/assertions/modes/none.pass.cpp
@@ -11,6 +11,7 @@
 
 // REQUIRES: libcpp-hardening-mode=none
 
+#include <__assert>
 #include <cassert>
 
 bool executed_condition = false;
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index b688a30cdb792d..7b6d35d9a7fc53 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -1514,7 +1514,6 @@ def produce_version_header():
 
 */
 
-#include <__assert> // all public C++ headers provide the assertion handler
 #include <__availability>
 #include <__config>
 

From 92e5f13ad19ace968309e31146e00e76b87c5a2d Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh@arm.com>
Date: Thu, 29 Feb 2024 13:29:35 +0000
Subject: [PATCH 28/55] [AArch64][GlobalISel] Legalize G_SHUFFLE_VECTOR for
 Odd-Sized Vectors (#83038)

---
 llvm/test/CodeGen/AArch64/shufflevector.ll | 565 +++++++++++++++++++++
 1 file changed, 565 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/shufflevector.ll

diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
new file mode 100644
index 00000000000000..df59eb8e629f44
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -0,0 +1,565 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:         warning: Instruction selection used fallback path for shufflevector_v2i1
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v4i8
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v32i8
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v2i16
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v16i16
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v2i1_zeroes
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v4i8_zeroes
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v32i8_zeroes
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v2i16_zeroes
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v16i16_zeroes
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v3i8
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v3i8_zeroes
+
+; ===== Legal Vector Types =====
+
+define <8 x i8> @shufflevector_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-SD-LABEL: shufflevector_v8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    adrp x8, .LCPI0_0
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
+; CHECK-SD-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    adrp x8, .LCPI0_0
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
+; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+    %c = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15>
+    ret <8 x i8> %c
+}
+
+define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: shufflevector_v16i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    adrp x8, .LCPI1_0
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI1_0]
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v16i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI1_0
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI1_0]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15, i32 2, i32 4, i32 6, i32 8, i32 25, i32 30, i32 31, i32 31>
+    ret <16 x i8> %c
+}
+
+define <4 x i16> @shufflevector_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: shufflevector_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp2 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+    ret <4 x i16> %c
+}
+
+define <8 x i16> @shufflevector_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SD-LABEL: shufflevector_v8i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    adrp x8, .LCPI3_0
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI3_0
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15>
+    ret <8 x i16> %c
+}
+
+define <2 x i32> @shufflevector_v2i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: shufflevector_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip2 v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+    ret <2 x i32> %c
+}
+
+define <4 x i32> @shufflevector_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+    ret <4 x i32> %c
+}
+
+define <2 x i64> @shufflevector_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: shufflevector_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip2 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+    ret <2 x i64> %c
+}
+
+; ===== Legal Vector Types with Zero Masks =====
+
+define <8 x i8> @shufflevector_v8i8_zeroes(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: shufflevector_v8i8_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.8b, v0.b[0]
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+    ret <8 x i8> %c
+}
+
+define <16 x i8> @shufflevector_v16i8_zeroes(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_v16i8_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.16b, v0.b[0]
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+    ret <16 x i8> %c
+}
+
+define <4 x i16> @shufflevector_v4i16_zeroes(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: shufflevector_v4i16_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+    ret <4 x i16> %c
+}
+
+define <8 x i16> @shufflevector_v8i16_zeroes(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_v8i16_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+    ret <8 x i16> %c
+}
+
+define <2 x i32> @shufflevector_v2i32_zeroes(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: shufflevector_v2i32_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.2s, v0.s[0]
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 0>
+    ret <2 x i32> %c
+}
+
+define <4 x i32> @shufflevector_v4i32_zeroes(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_v4i32_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+    ret <4 x i32> %c
+}
+
+define <2 x i64> @shufflevector_v2i64_zeroes(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: shufflevector_v2i64_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.2d, v0.d[0]
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 0>
+    ret <2 x i64> %c
+}
+
+; ===== Smaller/Larger Width Vectors with Legal Element Sizes =====
+
+define <2 x i1> @shufflevector_v2i1(<2 x i1> %a, <2 x i1> %b){
+; CHECK-LABEL: shufflevector_v2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i1> %a, <2 x i1> %b, <2 x i32> <i32 0, i32 3>
+    ret <2 x i1> %c
+}
+
+define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
+; CHECK-LABEL: shufflevector_v4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ext v0.8b, v1.8b, v0.8b, #6
+; CHECK-NEXT:    zip1 v1.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 7>
+    %d = bitcast <4 x i8> %c to i32
+    ret i32 %d
+}
+
+define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b){
+; CHECK-LABEL: shufflevector_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $q1_q2
+; CHECK-NEXT:    adrp x8, .LCPI16_0
+; CHECK-NEXT:    adrp x9, .LCPI16_1
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT:    ldr q4, [x9, :lo12:.LCPI16_1]
+; CHECK-NEXT:    tbl v0.16b, { v1.16b, v2.16b }, v3.16b
+; CHECK-NEXT:    tbl v1.16b, { v1.16b, v2.16b }, v4.16b
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 1, i32 32, i32 32, i32 32, i32 2, i32 32, i32 32, i32 32, i32 3, i32 32, i32 32, i32 32, i32 4, i32 32, i32 32, i32 32, i32 5, i32 32, i32 32, i32 32, i32 6, i32 32, i32 32, i32 32, i32 7, i32 32, i32 32, i32 32>
+    ret <32 x i8> %c
+}
+
+define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
+; CHECK-LABEL: shufflevector_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    strh w9, [sp, #12]
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    ldr w0, [sp, #12]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> <i32 1, i32 2>
+    %d = bitcast <2 x i16> %c to i32
+    ret i32 %d
+}
+
+define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b){
+; CHECK-LABEL: shufflevector_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $q1_q2
+; CHECK-NEXT:    adrp x8, .LCPI18_0
+; CHECK-NEXT:    adrp x9, .LCPI18_1
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI18_0]
+; CHECK-NEXT:    ldr q4, [x9, :lo12:.LCPI18_1]
+; CHECK-NEXT:    tbl v0.16b, { v1.16b, v2.16b }, v3.16b
+; CHECK-NEXT:    tbl v1.16b, { v1.16b, v2.16b }, v4.16b
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16>
+    ret <16 x i16> %c
+}
+
+define <1 x i32> @shufflevector_v1i32(<1 x i32> %a, <1 x i32> %b) {
+; CHECK-LABEL: shufflevector_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ret
+    %c = shufflevector <1 x i32> %a, <1 x i32> %b, <1 x i32> <i32 1>
+    ret <1 x i32> %c
+}
+
+define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-SD-LABEL: shufflevector_v8i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp2 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    mov v2.s[3], v3.s[3]
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v8i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI20_0
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    uzp2 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI20_0]
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    tbl v1.16b, { v2.16b, v3.16b }, v4.16b
+; CHECK-GI-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15>
+    ret <8 x i32> %c
+}
+
+define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-SD-LABEL: shufflevector_v4i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    zip2 v2.2d, v2.2d, v3.2d
+; CHECK-SD-NEXT:    zip2 v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v4i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    zip2 v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    zip2 v1.2d, v2.2d, v3.2d
+; CHECK-GI-NEXT:    ret
+    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+    ret <4 x i64> %c
+}
+
+; ===== Smaller/Larger Width Vectors with Zero Masks =====
+
+define <2 x i1> @shufflevector_v2i1_zeroes(<2 x i1> %a, <2 x i1> %b){
+; CHECK-LABEL: shufflevector_v2i1_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.2s, v0.s[0]
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i1> %a, <2 x i1> %b, <2 x i32> <i32 0, i32 0>
+    ret <2 x i1> %c
+}
+
+define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){
+; CHECK-LABEL: shufflevector_v4i8_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+    %d = bitcast <4 x i8> %c to i32
+    ret i32 %d
+}
+
+define <32 x i8> @shufflevector_v32i8_zeroes(<32 x i8> %a, <32 x i8> %b){
+; CHECK-LABEL: shufflevector_v32i8_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.16b, v0.b[0]
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+    ret <32 x i8> %c
+}
+
+define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){
+; CHECK-LABEL: shufflevector_v2i16_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v1.2s, v0.s[0]
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    strh w9, [sp, #12]
+; CHECK-NEXT:    mov w8, v1.s[1]
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    ldr w0, [sp, #12]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> <i32 0, i32 0>
+    %d = bitcast <2 x i16> %c to i32
+    ret i32 %d
+}
+
+define <16 x i16> @shufflevector_v16i16_zeroes(<16 x i16> %a, <16 x i16> %b){
+; CHECK-LABEL: shufflevector_v16i16_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+    ret <16 x i16> %c
+}
+
+define <1 x i32> @shufflevector_v1i32_zeroes(<1 x i32> %a, <1 x i32> %b) {
+; CHECK-LABEL: shufflevector_v1i32_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+    %c = shufflevector <1 x i32> %a, <1 x i32> %b, <1 x i32> <i32 0>
+    ret <1 x i32> %c
+}
+
+define <8 x i32> @shufflevector_v8i32_zeroes(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_v8i32_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+    ret <8 x i32> %c
+}
+
+define <4 x i64> @shufflevector_v4i64_zeroes(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: shufflevector_v4i64_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.2d, v0.d[0]
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+    ret <4 x i64> %c
+}
+
+; ===== Vectors with Non-Pow 2 Widths =====
+
+define <3 x i8> @shufflevector_v3i8(<3 x i8> %a, <3 x i8> %b) {
+; CHECK-LABEL: shufflevector_v3i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w0, w1
+; CHECK-NEXT:    mov w1, w2
+; CHECK-NEXT:    mov w2, w4
+; CHECK-NEXT:    ret
+    %c = shufflevector <3 x i8> %a, <3 x i8> %b, <3 x i32> <i32 1, i32 2, i32 4>
+    ret <3 x i8> %c
+}
+
+define <7 x i8> @shufflevector_v7i8(<7 x i8> %a, <7 x i8> %b) {
+; CHECK-SD-LABEL: shufflevector_v7i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    adrp x8, .LCPI31_0
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    ldr d1, [x8, :lo12:.LCPI31_0]
+; CHECK-SD-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v7i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    adrp x8, .LCPI31_0
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI31_0]
+; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+    %c = shufflevector <7 x i8> %a, <7 x i8> %b, <7 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12>
+    ret <7 x i8> %c
+}
+
+define <3 x i16> @shufflevector_v3i16(<3 x i16> %a, <3 x i16> %b) {
+; CHECK-SD-LABEL: shufflevector_v3i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    zip1 v1.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    zip2 v0.4h, v1.4h, v0.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v3i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    adrp x8, .LCPI32_0
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI32_0]
+; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+    %c = shufflevector <3 x i16> %a, <3 x i16> %b, <3 x i32> <i32 1, i32 2, i32 4>
+    ret <3 x i16> %c
+}
+
+define <7 x i16> @shufflevector_v7i16(<7 x i16> %a, <7 x i16> %b) {
+; CHECK-SD-LABEL: shufflevector_v7i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    adrp x8, .LCPI33_0
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI33_0]
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v7i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI33_0
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI33_0]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT:    ret
+    %c = shufflevector <7 x i16> %a, <7 x i16> %b, <7 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12>
+    ret <7 x i16> %c
+}
+
+define <3 x i32> @shufflevector_v3i32(<3 x i32> %a, <3 x i32> %b) {
+; CHECK-SD-LABEL: shufflevector_v3i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    zip1 v1.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    zip2 v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v3i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI34_0
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI34_0]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT:    ret
+    %c = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> <i32 1, i32 2, i32 4>
+    ret <3 x i32> %c
+}
+
+; ===== Vectors with Non-Pow 2 Widths with Zero Masks =====
+
+define <3 x i8> @shufflevector_v3i8_zeroes(<3 x i8> %a, <3 x i8> %b) {
+; CHECK-LABEL: shufflevector_v3i8_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w1, w0
+; CHECK-NEXT:    mov w2, w0
+; CHECK-NEXT:    ret
+    %c = shufflevector <3 x i8> %a, <3 x i8> %b, <3 x i32> <i32 0, i32 0, i32 0>
+    ret <3 x i8> %c
+}
+
+define <7 x i8> @shufflevector_v7i8_zeroes(<7 x i8> %a, <7 x i8> %b) {
+; CHECK-LABEL: shufflevector_v7i8_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.8b, v0.b[0]
+; CHECK-NEXT:    ret
+    %c = shufflevector <7 x i8> %a, <7 x i8> %b, <7 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+    ret <7 x i8> %c
+}
+
+define <3 x i16> @shufflevector_v3i16_zeroes(<3 x i16> %a, <3 x i16> %b) {
+; CHECK-LABEL: shufflevector_v3i16_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
+; CHECK-NEXT:    ret
+    %c = shufflevector <3 x i16> %a, <3 x i16> %b, <3 x i32> <i32 0, i32 0, i32 0>
+    ret <3 x i16> %c
+}
+
+define <7 x i16> @shufflevector_v7i16_zeroes(<7 x i16> %a, <7 x i16> %b) {
+; CHECK-LABEL: shufflevector_v7i16_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-NEXT:    ret
+    %c = shufflevector <7 x i16> %a, <7 x i16> %b, <7 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+    ret <7 x i16> %c
+}
+
+define <3 x i32> @shufflevector_v3i32_zeroes(<3 x i32> %a, <3 x i32> %b) {
+; CHECK-LABEL: shufflevector_v3i32_zeroes:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+    %c = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> <i32 0, i32 0, i32 0>
+    ret <3 x i32> %c
+}

From 80a328b0118faa691137d6325b0eed4199060adc Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 29 Feb 2024 15:05:46 +0000
Subject: [PATCH 29/55] [X86] SimplifyDemandedVectorEltsForTargetNode - add
 basic PCMPEQ/PCMPGT handling

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 14 ++++++++++
 .../CodeGen/X86/horizontal-reduce-umax.ll     |  2 +-
 .../CodeGen/X86/horizontal-reduce-umin.ll     |  2 +-
 .../CodeGen/X86/srem-seteq-illegal-types.ll   | 12 ++++++---
 llvm/test/CodeGen/X86/test-shrink-bug.ll      |  2 +-
 .../CodeGen/X86/urem-seteq-vec-nonzero.ll     |  1 -
 .../X86/urem-seteq-vec-tautological.ll        | 24 ++++++-----------
 llvm/test/CodeGen/X86/vector-reduce-umax.ll   |  2 +-
 llvm/test/CodeGen/X86/vector-reduce-umin.ll   |  2 +-
 llvm/test/CodeGen/X86/vselect.ll              | 26 +++++++++++++------
 10 files changed, 53 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d98d914894a3f8..b807a97d6e4851 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41262,6 +41262,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     KnownZero = LHSZero;
     break;
   }
+  case X86ISD::PCMPEQ:
+  case X86ISD::PCMPGT: {
+    APInt LHSUndef, LHSZero;
+    APInt RHSUndef, RHSZero;
+    SDValue LHS = Op.getOperand(0);
+    SDValue RHS = Op.getOperand(1);
+    if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
+                                   Depth + 1))
+      return true;
+    if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
+                                   Depth + 1))
+      return true;
+    break;
+  }
   case X86ISD::KSHIFTL: {
     SDValue Src = Op.getOperand(0);
     auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
index 5fde9bd5566b40..9946267b48e7ff 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -635,7 +635,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX2-LABEL: test_reduce_v4i64:
 ; X64-AVX2:       ## %bb.0:
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; X64-AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; X64-AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm4
 ; X64-AVX2-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
index 699dce75e505c7..0bbf94f1817f51 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -581,7 +581,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX2-LABEL: test_reduce_v4i64:
 ; X64-AVX2:       ## %bb.0:
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; X64-AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
 ; X64-AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm4
 ; X64-AVX2-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
index fdb2f41ec0e498..d644ed87c3c108 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
@@ -267,11 +267,13 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
 ; SSE41-NEXT:    pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT:    pxor %xmm1, %xmm2
+; SSE41-NEXT:    movl $3, %eax
+; SSE41-NEXT:    movq %rax, %xmm3
+; SSE41-NEXT:    pcmpeqq %xmm2, %xmm3
+; SSE41-NEXT:    pxor %xmm1, %xmm3
 ; SSE41-NEXT:    movd %xmm0, %eax
 ; SSE41-NEXT:    pextrb $8, %xmm0, %edx
-; SSE41-NEXT:    pextrb $0, %xmm2, %ecx
+; SSE41-NEXT:    pextrb $0, %xmm3, %ecx
 ; SSE41-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE41-NEXT:    # kill: def $dl killed $dl killed $edx
 ; SSE41-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -318,7 +320,9 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    movl $3, %eax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll
index 74c5179b187896..ed43cabbdaee11 100644
--- a/llvm/test/CodeGen/X86/test-shrink-bug.ll
+++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll
@@ -67,7 +67,7 @@ define dso_local void @fail(i16 %a, <2 x i8> %b) {
 ; CHECK-X64-NEXT:    testl $263, %edi # imm = 0x107
 ; CHECK-X64-NEXT:    je .LBB1_3
 ; CHECK-X64-NEXT:  # %bb.1:
-; CHECK-X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-X64-NEXT:    pslld $8, %xmm0
 ; CHECK-X64-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-X64-NEXT:    pextrw $1, %xmm0, %eax
 ; CHECK-X64-NEXT:    testb $1, %al
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
index a15de8b8e0f6ad..6a36cd2a86d5cd 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
@@ -264,7 +264,6 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
index 6d99bedd40b91c..cdeca96732dc31 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
@@ -25,13 +25,7 @@ define <4 x i1> @t0_all_tautological(<4 x i32> %X) nounwind {
 define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: t1_all_odd_eq:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -82,13 +76,7 @@ define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind {
 define <4 x i1> @t1_all_odd_ne(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: t1_all_odd_ne:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -256,7 +244,9 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
 ; CHECK-AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; CHECK-AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    movabsq $-3074457345618258603, %rax # imm = 0xD555555555555555
+; CHECK-AVX1-NEXT:    vmovq %rax, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
@@ -273,7 +263,9 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
 ; CHECK-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; CHECK-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    movabsq $-3074457345618258603, %rax # imm = 0xD555555555555555
+; CHECK-AVX2-NEXT:    vmovq %rax, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
index 4799b8e7e5857b..3b25a6e033f2fd 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
@@ -210,7 +210,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2-LABEL: test_v4i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm4
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
index 75eeec456c9ac3..2d68cf9d6374d7 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
@@ -211,7 +211,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2-LABEL: test_v4i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm4
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index ce3dc8cc873cc7..cc4eb0c8f7343b 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -741,14 +741,24 @@ define i64 @vselect_any_extend_vector_inreg_crash(ptr %x) {
 ; SSE-NEXT:    shll $15, %eax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: vselect_any_extend_vector_inreg_crash:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    andl $1, %eax
-; AVX-NEXT:    shll $15, %eax
-; AVX-NEXT:    retq
+; AVX1-LABEL: vselect_any_extend_vector_inreg_crash:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    andl $1, %eax
+; AVX1-NEXT:    shll $15, %eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vselect_any_extend_vector_inreg_crash:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [49,49,49,49]
+; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    shll $15, %eax
+; AVX2-NEXT:    retq
 0:
   %1 = load <8 x i8>, ptr %x
   %2 = icmp eq <8 x i8> %1, <i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49>

From 5bd01ac822d1d700623790ef146fb78216576616 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen@arm.com>
Date: Thu, 29 Feb 2024 15:35:46 +0000
Subject: [PATCH 30/55] [AArch64] Re-enable rematerialization for
 streaming-mode-changing functions. (#83235)

We can add implicit defs/uses of the 'VG' register to the instructions
to prevent the register allocator from rematerializing values in between
streaming-mode changes, as the def/use of VG will further nail down the
ordering that comes out of ISel. This avoids the heavy-handed approach
to prevent any kind of rematerialization.

While we could add 'VG' as a Use to all SVE instructions, we only really
need to do this for instructions that are rematerializable, as the
smstart/smstop instructions and pseudos act as scheduling barriers which
is sufficient to prevent other instructions from being scheduled in
between the streaming-mode-changing call sequence. However, we may
revisit this in the future.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 16 ++++++
 .../lib/Target/AArch64/AArch64InstrFormats.td |  1 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 52 -------------------
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |  2 -
 .../Target/AArch64/AArch64RegisterInfo.cpp    |  3 ++
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  2 +
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |  2 +
 llvm/lib/Target/AArch64/SVEInstrFormats.td    | 10 ++++
 .../AArch64/debug-info-sve-dbg-declare.mir    |  2 +-
 .../AArch64/debug-info-sve-dbg-value.mir      |  2 +-
 .../CodeGen/AArch64/live-debugvalues-sve.mir  |  4 +-
 .../CodeGen/AArch64/sve-localstackalloc.mir   |  2 +-
 .../AArch64/sve-pfalse-machine-cse.mir        |  6 +--
 .../AArch64/sve-pseudos-expand-undef.mir      |  4 +-
 .../AArch64/sve-ptest-removal-cmpeq.mir       | 10 ++--
 .../AArch64/sve-ptest-removal-whilege.mir     | 24 ++++-----
 .../AArch64/sve-ptest-removal-whilegt.mir     | 24 ++++-----
 .../AArch64/sve-ptest-removal-whilehi.mir     | 24 ++++-----
 .../AArch64/sve-ptest-removal-whilehs.mir     | 24 ++++-----
 .../AArch64/sve-ptest-removal-whilele.mir     | 24 ++++-----
 .../AArch64/sve-ptest-removal-whilelo.mir     | 24 ++++-----
 .../AArch64/sve-ptest-removal-whilels.mir     | 24 ++++-----
 .../AArch64/sve-ptest-removal-whilelt.mir     | 24 ++++-----
 .../AArch64/sve-ptest-removal-whilerw.mir     | 16 +++---
 .../AArch64/sve-ptest-removal-whilewr.mir     | 16 +++---
 llvm/test/CodeGen/AArch64/sve2p1_copy_pnr.mir | 12 ++---
 26 files changed, 167 insertions(+), 187 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c21bc3a4abbc00..b1677df56e1bea 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7518,6 +7518,22 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
           (AArch64::GPR32RegClass.contains(MO.getReg()) ||
            AArch64::GPR64RegClass.contains(MO.getReg())))
         MI.removeOperand(I);
+
+  // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
+  // have nothing to do with VG, were it not that they are used to materialise a
+  // frame-address. If they contain a frame-index to a scalable vector, this
+  // will likely require an ADDVL instruction to materialise the address, thus
+  // reading VG.
+  const MachineFunction &MF = *MI.getMF();
+  if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
+      (MI.getOpcode() == AArch64::ADDXri ||
+       MI.getOpcode() == AArch64::SUBXri)) {
+    const MachineOperand &MO = MI.getOperand(1);
+    if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
+                         TargetStackID::ScalableVector)
+      MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
+                                              /*IsImplicit=*/true));
+  }
 }
 
 SDValue AArch64TargetLowering::changeStreamingMode(
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 10ad5b1f8f2580..7f8856db6c6e61 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -2809,6 +2809,7 @@ class AddSubImmShift<bit isSub, bit setFlags, RegisterClass dstRegtype,
   let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12
   let Inst{21-10} = imm{11-0};
   let DecoderMethod = "DecodeAddSubImmShift";
+  let hasPostISelHook = 1;
 }
 
 class BaseAddSubRegPseudo<RegisterClass regtype,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 39c96092f10319..17e0e36ee6821e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -9481,58 +9481,6 @@ unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
     return AArch64::BLR;
 }
 
-bool AArch64InstrInfo::isReallyTriviallyReMaterializable(
-    const MachineInstr &MI) const {
-  const MachineFunction &MF = *MI.getMF();
-  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
-
-  // If the function contains changes to streaming mode, then there
-  // is a danger that rematerialised instructions end up between
-  // instruction sequences (e.g. call sequences, or prolog/epilogue)
-  // where the streaming-SVE mode is temporarily changed.
-  if (AFI.hasStreamingModeChanges()) {
-    // Avoid rematerializing rematerializable instructions that use/define
-    // scalable values, such as 'pfalse' or 'ptrue', which result in different
-    // results when the runtime vector length is different.
-    const MachineRegisterInfo &MRI = MF.getRegInfo();
-    const MachineFrameInfo &MFI = MF.getFrameInfo();
-    if (any_of(MI.operands(), [&MRI, &MFI](const MachineOperand &MO) {
-          if (MO.isFI() &&
-              MFI.getStackID(MO.getIndex()) == TargetStackID::ScalableVector)
-            return true;
-          if (!MO.isReg())
-            return false;
-
-          if (MO.getReg().isVirtual()) {
-            const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
-            return AArch64::ZPRRegClass.hasSubClassEq(RC) ||
-                   AArch64::PPRRegClass.hasSubClassEq(RC);
-          }
-          return AArch64::ZPRRegClass.contains(MO.getReg()) ||
-                 AArch64::PPRRegClass.contains(MO.getReg());
-        }))
-      return false;
-
-    // Avoid rematerializing instructions that return a value that is
-    // different depending on vector length, even when it is not returned
-    // in a scalable vector/predicate register.
-    switch (MI.getOpcode()) {
-    default:
-      break;
-    case AArch64::RDVLI_XI:
-    case AArch64::ADDVL_XXI:
-    case AArch64::ADDPL_XXI:
-    case AArch64::CNTB_XPiI:
-    case AArch64::CNTH_XPiI:
-    case AArch64::CNTW_XPiI:
-    case AArch64::CNTD_XPiI:
-      return false;
-    }
-  }
-
-  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
-}
-
 MachineBasicBlock::iterator
 AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
                                    Register TargetReg, bool FrameSetup) const {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 63e0cb80d8586f..6c6689091ead4d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -381,8 +381,6 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
                                                   int64_t &ByteSized,
                                                   int64_t &VGSized);
 
-  bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override;
-
   // Return true if address of the form BaseReg + Scale * ScaledReg + Offset can
   // be used for a load/store of NumBytes. BaseReg is always present and
   // implicit.
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index b919c116445c8b..531f21f9c043a2 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -443,6 +443,9 @@ AArch64RegisterInfo::getStrictlyReservedRegs(const MachineFunction &MF) const {
       Reserved.set(SubReg);
   }
 
+  // VG cannot be allocated
+  Reserved.set(AArch64::VG);
+
   if (MF.getSubtarget<AArch64Subtarget>().hasSME2()) {
     for (MCSubRegIterator SubReg(AArch64::ZT0, this, /*self=*/true);
          SubReg.isValid(); ++SubReg)
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index acf067f2cc5a9d..2907ba74ff8108 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -233,6 +233,8 @@ def MSRpstatePseudo :
            (ins svcr_op:$pstatefield, timm0_1:$imm, GPR64:$rtpstate, timm0_1:$expected_pstate, variable_ops), []>,
     Sched<[WriteSys]> {
   let hasPostISelHook = 1;
+  let Uses = [VG];
+  let Defs = [VG];
 }
 
 def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)),
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 44d9a8ac7cb677..33cb5f9734b819 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -223,6 +223,8 @@ def MSRpstatesvcrImm1
   let Inst{8} = imm;
   let Inst{7-5} = 0b011; // op2
   let hasPostISelHook = 1;
+  let Uses = [VG];
+  let Defs = [VG];
 }
 
 def : InstAlias<"smstart",    (MSRpstatesvcrImm1 0b011, 0b1)>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 789ec817d3d8b8..c8ca1832ec18dd 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -365,6 +365,7 @@ class sve_int_ptrue<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty,
   let ElementSize = pprty.ElementSize;
   let hasSideEffects = 0;
   let isReMaterializable = 1;
+  let Uses = [VG];
 }
 
 multiclass sve_int_ptrue<bits<3> opc, string asm, SDPatternOperator op> {
@@ -755,6 +756,7 @@ class sve_int_pfalse<bits<6> opc, string asm>
 
   let hasSideEffects = 0;
   let isReMaterializable = 1;
+  let Uses = [VG];
 }
 
 multiclass sve_int_pfalse<bits<6> opc, string asm> {
@@ -1090,6 +1092,7 @@ class sve_int_count<bits<3> opc, string asm>
 
   let hasSideEffects = 0;
   let isReMaterializable = 1;
+  let Uses = [VG];
 }
 
 multiclass sve_int_count<bits<3> opc, string asm, SDPatternOperator op> {
@@ -1982,6 +1985,7 @@ class sve_int_dup_mask_imm<string asm>
   let DecoderMethod = "DecodeSVELogicalImmInstruction";
   let hasSideEffects = 0;
   let isReMaterializable = 1;
+  let Uses = [VG];
 }
 
 multiclass sve_int_dup_mask_imm<string asm> {
@@ -2862,6 +2866,7 @@ class sve_int_arith_vl<bit opc, string asm, bit streaming_sve = 0b0>
   let Inst{4-0}   = Rd;
 
   let hasSideEffects = 0;
+  let Uses = [VG];
 }
 
 class sve_int_read_vl_a<bit op, bits<5> opc2, string asm, bit streaming_sve = 0b0>
@@ -2882,6 +2887,7 @@ class sve_int_read_vl_a<bit op, bits<5> opc2, string asm, bit streaming_sve = 0b
 
   let hasSideEffects = 0;
   let isReMaterializable = 1;
+  let Uses = [VG];
 }
 
 //===----------------------------------------------------------------------===//
@@ -4699,6 +4705,7 @@ class sve_int_dup_imm<bits<2> sz8_64, string asm,
 
   let hasSideEffects = 0;
   let isReMaterializable = 1;
+  let Uses = [VG];
 }
 
 multiclass sve_int_dup_imm<string asm> {
@@ -4741,6 +4748,7 @@ class sve_int_dup_fpimm<bits<2> sz8_64, Operand fpimmtype,
 
   let hasSideEffects = 0;
   let isReMaterializable = 1;
+  let Uses = [VG];
 }
 
 multiclass sve_int_dup_fpimm<string asm> {
@@ -5657,6 +5665,7 @@ class sve_int_index_ii<bits<2> sz8_64, string asm, ZPRRegOp zprty,
 
   let hasSideEffects = 0;
   let isReMaterializable = 1;
+  let Uses = [VG];
 }
 
 multiclass sve_int_index_ii<string asm> {
@@ -9308,6 +9317,7 @@ class sve2p1_ptrue_pn<string mnemonic, bits<2> sz, PNRP8to15RegOp pnrty, SDPatte
 
   let hasSideEffects = 0;
   let isReMaterializable = 1;
+  let Uses = [VG];
 }
 
 
diff --git a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir
index d44d45ea03b6bc..aca2816225e3ee 100644
--- a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir
+++ b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir
@@ -193,7 +193,7 @@ body:             |
     liveins: $z0, $z1, $p0, $p1, $w0
 
     renamable $p2 = COPY killed $p0
-    renamable $p0 = PTRUE_S 31
+    renamable $p0 = PTRUE_S 31, implicit $vg
     ST1W_IMM killed renamable $z0, renamable $p0, %stack.0.z0.addr, 0 :: (store unknown-size into %ir.z0.addr, align 16)
     ST1W_IMM killed renamable $z1, renamable $p0, %stack.1.z1.addr, 0 :: (store unknown-size into %ir.z1.addr, align 16)
     STR_PXI killed renamable $p2, %stack.2.p0.addr, 0 :: (store unknown-size into %ir.p0.addr, align 2)
diff --git a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
index 75917ef32ae2ad..0ea180b20730f2 100644
--- a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
+++ b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
@@ -111,7 +111,7 @@ body:             |
     STRXui killed renamable $x1, %stack.1, 0, debug-location !8
     DBG_VALUE %stack.1, $noreg, !11, !DIExpression(DW_OP_constu, 16, DW_OP_plus, DW_OP_deref), debug-location !8
 
-    renamable $p2 = PTRUE_S 31, debug-location !DILocation(line: 4, column: 1, scope: !5)
+    renamable $p2 = PTRUE_S 31, implicit $vg, debug-location !DILocation(line: 4, column: 1, scope: !5)
     ST1W_IMM renamable $z0, renamable $p2, %stack.2, 0, debug-location !DILocation(line: 5, column: 1, scope: !5)
     DBG_VALUE %stack.2, $noreg, !12, !DIExpression(DW_OP_deref), debug-location !DILocation(line: 5, column: 1, scope: !5)
     ST1W_IMM renamable $z1, killed renamable $p2, %stack.3, 0, debug-location !DILocation(line: 6, column: 1, scope: !5)
diff --git a/llvm/test/CodeGen/AArch64/live-debugvalues-sve.mir b/llvm/test/CodeGen/AArch64/live-debugvalues-sve.mir
index 8903ca2b865b98..612453ab53f438 100644
--- a/llvm/test/CodeGen/AArch64/live-debugvalues-sve.mir
+++ b/llvm/test/CodeGen/AArch64/live-debugvalues-sve.mir
@@ -145,7 +145,7 @@ body:             |
     liveins: $z1
 
     ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp, debug-location !34
-    renamable $p0 = PTRUE_S 31, debug-location !34
+    renamable $p0 = PTRUE_S 31, implicit $vg, debug-location !34
     $x0 = ADDXri %stack.0, 0, 0, debug-location !34
     ST1W_IMM renamable $z1, killed renamable $p0, %stack.0, 0, debug-location !34 :: (store unknown-size into %stack.0, align 16)
     $z0 = COPY renamable $z1, debug-location !34
@@ -157,7 +157,7 @@ body:             |
     $z7 = COPY renamable $z1, debug-location !34
     BL @bar, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit $z1, implicit $z2, implicit $z3, implicit $z4, implicit $z5, implicit $z6, implicit $z7, implicit $x0, implicit-def $sp, implicit-def $z0, implicit-def $z1, debug-location !34
     ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp, debug-location !34
-    renamable $p0 = PTRUE_S 31, debug-location !34
+    renamable $p0 = PTRUE_S 31, implicit $vg, debug-location !34
     $z3 = IMPLICIT_DEF
     renamable $z1 = LD1W_IMM renamable $p0, %stack.0, 0, debug-location !34 :: (load unknown-size from %stack.0, align 16)
     ST1W_IMM renamable $z3, killed renamable $p0, %stack.0, 0 :: (store unknown-size into %stack.0, align 16)
diff --git a/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir b/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir
index 3fbb7889c8b79b..6063c8dfc792c9 100644
--- a/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir
+++ b/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir
@@ -48,7 +48,7 @@ body:             |
     %2:gpr32 = COPY $w0
     %1:zpr = COPY $z1
     %0:zpr = COPY $z0
-    %5:ppr_3b = PTRUE_B 31
+    %5:ppr_3b = PTRUE_B 31, implicit $vg
     %6:gpr64sp = ADDXri %stack.0, 0, 0
     ST1B_IMM %1, %5, %6, 1 :: (store unknown-size, align 16)
     ST1B_IMM %0, %5, %stack.0, 0 :: (store unknown-size into %stack.0, align 16)
diff --git a/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir b/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir
index b76fe7821b6c69..8395a7619fbb46 100644
--- a/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir
+++ b/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir
@@ -11,15 +11,15 @@ body:             |
     ; CHECK: liveins: $p0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:ppr = COPY $p0
-    ; CHECK-NEXT: [[PFALSE:%[0-9]+]]:ppr = PFALSE
+    ; CHECK-NEXT: [[PFALSE:%[0-9]+]]:ppr = PFALSE implicit $vg
     ; CHECK-NEXT: [[UZP1_PPP_B:%[0-9]+]]:ppr = UZP1_PPP_B [[COPY]], [[PFALSE]]
     ; CHECK-NEXT: [[UZP1_PPP_B1:%[0-9]+]]:ppr = UZP1_PPP_B killed [[UZP1_PPP_B]], [[PFALSE]]
     ; CHECK-NEXT: $p0 = COPY [[UZP1_PPP_B1]]
     ; CHECK-NEXT: RET_ReallyLR implicit $p0
     %0:ppr = COPY $p0
-    %2:ppr = PFALSE
+    %2:ppr = PFALSE implicit $vg
     %3:ppr = UZP1_PPP_B %0, %2
-    %4:ppr = PFALSE
+    %4:ppr = PFALSE implicit $vg
     %5:ppr = UZP1_PPP_B killed %3, %4
     $p0 = COPY %5
     RET_ReallyLR implicit $p0
diff --git a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir
index df0e50de4d1a7a..ae70f91a4ec641 100644
--- a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir
+++ b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir
@@ -26,7 +26,7 @@ body:             |
 name: expand_mls_to_msb
 body:             |
   bb.0:
-    renamable $p0 = PTRUE_B 31
+    renamable $p0 = PTRUE_B 31, implicit $vg
     renamable $z0 = MLS_ZPZZZ_B_UNDEF killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1
     RET_ReallyLR implicit $z0
 ...
@@ -36,7 +36,7 @@ body:             |
 name: expand_mla_to_mad
 body:             |
   bb.0:
-    renamable $p0 = PTRUE_B 31
+    renamable $p0 = PTRUE_B 31, implicit $vg
     renamable $z0 = MLA_ZPZZZ_B_UNDEF killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1
     RET_ReallyLR implicit $z0
 ...
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir
index 81318aa5c2a58d..5169113697dc60 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir
@@ -174,7 +174,7 @@ body:             |
     %1:zpr = COPY $z0
     %0:ppr_3b = COPY $p0
     %2:ppr = CMPEQ_PPzZI_B %0, %1, 0, implicit-def dead $nzcv
-    %3:ppr = PTRUE_B 31
+    %3:ppr = PTRUE_B 31, implicit $vg
     PTEST_PP killed %3, killed %2, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
     %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
@@ -409,14 +409,14 @@ body:             |
 
     ; CHECK-LABEL: name: cmpeq_imm_nxv16i8_ptest_not_all_active
     ; CHECK: %2:ppr = CMPEQ_PPzZI_B %0, %1, 0, implicit-def dead $nzcv
-    ; CHECK-NEXT: %3:ppr = PTRUE_B 0
+    ; CHECK-NEXT: %3:ppr = PTRUE_B 0, implicit $vg
     ; CHECK-NEXT: PTEST_PP killed %3, killed %2, implicit-def $nzcv
     ; CHECK-NEXT: %4:gpr32 = COPY $wzr
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:zpr = COPY $z0
     %0:ppr_3b = COPY $p0
     %2:ppr = CMPEQ_PPzZI_B %0, %1, 0, implicit-def dead $nzcv
-    %3:ppr = PTRUE_B 0
+    %3:ppr = PTRUE_B 0, implicit $vg
     PTEST_PP killed %3, killed %2, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
     %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
@@ -446,14 +446,14 @@ body:             |
 
     ; CHECK-LABEL: name: cmpeq_imm_nxv16i8_ptest_of_halfs
     ; CHECK: %2:ppr = CMPEQ_PPzZI_B %0, %1, 0, implicit-def dead $nzcv
-    ; CHECK-NEXT: %3:ppr = PTRUE_H 31
+    ; CHECK-NEXT: %3:ppr = PTRUE_H 31, implicit $vg
     ; CHECK-NEXT: PTEST_PP killed %3, killed %2, implicit-def $nzcv
     ; CHECK-NEXT: %4:gpr32 = COPY $wzr
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:zpr = COPY $z0
     %0:ppr_3b = COPY $p0
     %2:ppr = CMPEQ_PPzZI_B %0, %1, 0, implicit-def dead $nzcv
-    %3:ppr = PTRUE_H 31
+    %3:ppr = PTRUE_H 31, implicit $vg
     PTEST_PP killed %3, killed %2, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
     %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilege.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilege.mir
index 8f7467d99154e3..c1d9dfff73447c 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilege.mir
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilege.mir
@@ -30,7 +30,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILEGE_PWW_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -63,7 +63,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg, implicit $vg
     %3:ppr = WHILEGE_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -98,7 +98,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILEGE_PWW_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -133,7 +133,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILEGE_PXX_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -168,7 +168,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILEGE_PWW_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -203,7 +203,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILEGE_PXX_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -238,7 +238,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILEGE_PWW_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -273,7 +273,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILEGE_PXX_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -313,7 +313,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 0
+    %2:ppr = PTRUE_B 0, implicit $vg
     %3:ppr = WHILEGE_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -353,7 +353,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %3:ppr = WHILEGE_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -393,7 +393,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %3:ppr = WHILEGE_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -433,7 +433,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %3:ppr = WHILEGE_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilegt.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilegt.mir
index 217d984560e36c..c6df21f85db773 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilegt.mir
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilegt.mir
@@ -30,7 +30,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILEGT_PWW_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -63,7 +63,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg, implicit $vg
     %3:ppr = WHILEGT_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -98,7 +98,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg, implicit $vg
     %4:ppr = WHILEGT_PWW_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -133,7 +133,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg, implicit $vg
     %4:ppr = WHILEGT_PXX_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -168,7 +168,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg, implicit $vg
     %4:ppr = WHILEGT_PWW_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -203,7 +203,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg, implicit $vg
     %4:ppr = WHILEGT_PXX_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -238,7 +238,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg, implicit $vg
     %4:ppr = WHILEGT_PWW_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -273,7 +273,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg, implicit $vg
     %4:ppr = WHILEGT_PXX_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -313,7 +313,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 1
+    %2:ppr = PTRUE_H 1, implicit $vg, implicit $vg
     %3:ppr = WHILEGT_PXX_H %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -353,7 +353,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILEGT_PXX_H %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -393,7 +393,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %3:ppr = WHILEGT_PXX_H %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -433,7 +433,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %3:ppr = WHILEGT_PXX_H %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehi.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehi.mir
index 8d6f466c6b735d..7d8aed3c325a01 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehi.mir
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehi.mir
@@ -30,7 +30,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILEHI_PWW_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -63,7 +63,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILEHI_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -98,7 +98,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILEHI_PWW_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -133,7 +133,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILEHI_PXX_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -168,7 +168,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILEHI_PWW_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -203,7 +203,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILEHI_PXX_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -238,7 +238,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILEHI_PWW_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -273,7 +273,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILEHI_PXX_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -313,7 +313,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 29
+    %2:ppr = PTRUE_S 29, implicit $vg
     %3:ppr = WHILEHI_PXX_S %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -353,7 +353,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILEHI_PXX_S %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -393,7 +393,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %3:ppr = WHILEHI_PXX_S %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -433,7 +433,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %3:ppr = WHILEHI_PXX_S %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehs.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehs.mir
index da76a30f843b7a..f4dbfbc3db1cab 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehs.mir
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehs.mir
@@ -30,7 +30,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILEHS_PWW_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -63,7 +63,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILEHS_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -98,7 +98,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILEHS_PWW_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -133,7 +133,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILEHS_PXX_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -168,7 +168,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILEHS_PWW_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -203,7 +203,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILEHS_PXX_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -238,7 +238,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILEHS_PWW_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -273,7 +273,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILEHS_PXX_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -313,7 +313,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 30
+    %2:ppr = PTRUE_D 30, implicit $vg
     %3:ppr = WHILEHS_PXX_D %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -353,7 +353,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILEHS_PXX_D %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -393,7 +393,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %3:ppr = WHILEHS_PXX_D %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -433,7 +433,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %3:ppr = WHILEHS_PXX_D %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilele.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilele.mir
index 32954d593c1ddd..dc2265490cb55e 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilele.mir
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilele.mir
@@ -30,7 +30,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILELE_PWW_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -63,7 +63,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILELE_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -98,7 +98,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILELE_PWW_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -133,7 +133,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILELE_PXX_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -168,7 +168,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILELE_PWW_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -203,7 +203,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILELE_PXX_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -238,7 +238,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILELE_PWW_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -273,7 +273,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILELE_PXX_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -313,7 +313,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_B 7
+    %2:ppr = PTRUE_B 7, implicit $vg
     %3:ppr = WHILELE_PWW_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -353,7 +353,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %3:ppr = WHILELE_PWW_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -393,7 +393,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %3:ppr = WHILELE_PWW_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -433,7 +433,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %3:ppr = WHILELE_PWW_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir
index cca0ab8ef210b9..4d66e3e57da8b3 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir
@@ -30,7 +30,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILELO_PWW_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -63,7 +63,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILELO_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -98,7 +98,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILELO_PWW_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -133,7 +133,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILELO_PXX_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -168,7 +168,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILELO_PWW_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -203,7 +203,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILELO_PXX_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -238,7 +238,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILELO_PWW_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -273,7 +273,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILELO_PXX_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -313,7 +313,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_H 6
+    %2:ppr = PTRUE_H 6, implicit $vg
     %3:ppr = WHILELO_PWW_H %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -353,7 +353,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILELO_PWW_H %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -393,7 +393,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %3:ppr = WHILELO_PWW_H %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -433,7 +433,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %3:ppr = WHILELO_PWW_H %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilels.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilels.mir
index 4bae3a1986f451..ea02f8c70ef86c 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilels.mir
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilels.mir
@@ -30,7 +30,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILELS_PWW_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -63,7 +63,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILELS_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -98,7 +98,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILELS_PWW_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -133,7 +133,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILELS_PXX_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -168,7 +168,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILELS_PWW_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -203,7 +203,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILELS_PXX_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -238,7 +238,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILELS_PWW_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -273,7 +273,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILELS_PXX_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -313,7 +313,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_S 5
+    %2:ppr = PTRUE_S 5, implicit $vg
     %3:ppr = WHILELS_PWW_S %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -353,7 +353,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILELS_PWW_S %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -393,7 +393,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %3:ppr = WHILELS_PWW_S %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -433,7 +433,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %3:ppr = WHILELS_PWW_S %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelt.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelt.mir
index 3c6a9e21b4c6c1..d08781f203e328 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelt.mir
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelt.mir
@@ -30,7 +30,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILELT_PWW_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -63,7 +63,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILELT_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -98,7 +98,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILELT_PWW_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -133,7 +133,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILELT_PXX_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -168,7 +168,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILELT_PWW_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -203,7 +203,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILELT_PXX_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -238,7 +238,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILELT_PWW_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -273,7 +273,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILELT_PXX_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -313,7 +313,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_D 4
+    %2:ppr = PTRUE_D 4, implicit $vg
     %3:ppr = WHILELT_PWW_D %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -353,7 +353,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILELT_PWW_D %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -393,7 +393,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %3:ppr = WHILELT_PWW_D %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -433,7 +433,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr32 = COPY $w1
     %0:gpr32 = COPY $w0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %3:ppr = WHILELT_PWW_D %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilerw.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilerw.mir
index 27cdf593df776f..d800009b9537f3 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilerw.mir
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilerw.mir
@@ -30,7 +30,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILERW_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -65,7 +65,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILERW_PXX_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -100,7 +100,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILERW_PXX_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -135,7 +135,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILERW_PXX_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -175,7 +175,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 0
+    %2:ppr = PTRUE_B 0, implicit $vg
     %3:ppr = WHILERW_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -215,7 +215,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %3:ppr = WHILERW_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -255,7 +255,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %3:ppr = WHILERW_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -295,7 +295,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %3:ppr = WHILERW_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilewr.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilewr.mir
index 3b49b1ec2c8045..9f8b7c3197ecf2 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilewr.mir
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilewr.mir
@@ -30,7 +30,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 31
+    %2:ppr = PTRUE_B 31, implicit $vg
     %3:ppr = WHILEWR_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -65,7 +65,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %4:ppr = WHILEWR_PXX_H %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -100,7 +100,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %4:ppr = WHILEWR_PXX_S %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -135,7 +135,7 @@ body:             |
     ; CHECK-NOT: PTEST
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %4:ppr = WHILEWR_PXX_D %0, %1, implicit-def dead $nzcv
     PTEST_PP %2, %4, implicit-def $nzcv
     %6:gpr32 = COPY $wzr
@@ -175,7 +175,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_B 0
+    %2:ppr = PTRUE_B 0, implicit $vg
     %3:ppr = WHILEWR_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -215,7 +215,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_H 31
+    %2:ppr = PTRUE_H 31, implicit $vg
     %3:ppr = WHILEWR_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -255,7 +255,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_S 31
+    %2:ppr = PTRUE_S 31, implicit $vg
     %3:ppr = WHILEWR_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
@@ -295,7 +295,7 @@ body:             |
     ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv
     %1:gpr64 = COPY $x1
     %0:gpr64 = COPY $x0
-    %2:ppr = PTRUE_D 31
+    %2:ppr = PTRUE_D 31, implicit $vg
     %3:ppr = WHILEWR_PXX_B %0, %1, implicit-def dead $nzcv
     PTEST_PP killed %2, killed %3, implicit-def $nzcv
     %4:gpr32 = COPY $wzr
diff --git a/llvm/test/CodeGen/AArch64/sve2p1_copy_pnr.mir b/llvm/test/CodeGen/AArch64/sve2p1_copy_pnr.mir
index d6a87a42a79e00..5e5db2ac4e2079 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1_copy_pnr.mir
+++ b/llvm/test/CodeGen/AArch64/sve2p1_copy_pnr.mir
@@ -13,10 +13,10 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; CHECK-LABEL: name: pnr_to_ppr
-    ; CHECK: renamable $pn8 = PTRUE_C_D
+    ; CHECK: renamable $pn8 = PTRUE_C_D implicit $vg
     ; CHECK-NEXT: $p0 = ORR_PPzPP $p8, $p8, killed $p8
     ; CHECK-NEXT: RET_ReallyLR implicit killed $p0
-    renamable $pn8 = PTRUE_C_D
+    renamable $pn8 = PTRUE_C_D implicit $vg
     $p0 = COPY killed renamable $pn8
     RET_ReallyLR implicit killed $p0
 
@@ -34,10 +34,10 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; CHECK-LABEL: name: ppr_to_pnr
-    ; CHECK: renamable $p8 = PTRUE_H 31
+    ; CHECK: renamable $p8 = PTRUE_H 31, implicit $vg
     ; CHECK-NEXT: $p0 = ORR_PPzPP $p8, $p8, killed $p8, implicit-def $pn0
     ; CHECK-NEXT: RET_ReallyLR implicit killed $pn0
-    renamable $p8 = PTRUE_H 31
+    renamable $p8 = PTRUE_H 31, implicit $vg
     $pn0 = COPY killed renamable $p8
     RET_ReallyLR implicit killed $pn0
 
@@ -55,10 +55,10 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; CHECK-LABEL: name: pnr_to_pnr
-    ; CHECK: renamable $pn8 = PTRUE_C_H
+    ; CHECK: renamable $pn8 = PTRUE_C_H implicit $vg
     ; CHECK-NEXT: $p0 = ORR_PPzPP $p8, $p8, killed $p8, implicit-def $pn0
     ; CHECK-NEXT: RET_ReallyLR implicit killed $pn0
-    renamable $pn8 = PTRUE_C_H
+    renamable $pn8 = PTRUE_C_H implicit $vg
     $pn0 = COPY killed renamable $pn8
     RET_ReallyLR implicit killed $pn0
 

From c66f2d0c4a46ba66fb98a2cab4e63ad90888a261 Mon Sep 17 00:00:00 2001
From: Devajith Valaparambil Sreeramaswamy <devajithvs@gmail.com>
Date: Thu, 29 Feb 2024 07:46:48 -0800
Subject: [PATCH 31/55] [mlir-query] Add function extraction feature to
 mlir-query

This enables specifying the extract modifier to extract all matches into
a function. This currently does this very directly by converting all
operands to function arguments (ones due to results of other matched ops
are dropped) and all results as return values.

Differential Revision: https://reviews.llvm.org/D158693
---
 .../include/mlir/Query/Matcher/ErrorBuilder.h |  7 +-
 .../mlir/Query/Matcher/MatchersInternal.h     |  7 ++
 mlir/lib/Query/Matcher/Diagnostics.cpp        | 10 +++
 mlir/lib/Query/Matcher/Parser.cpp             | 67 ++++++++++++++--
 mlir/lib/Query/Matcher/Parser.h               | 18 +++--
 mlir/lib/Query/Matcher/RegistryManager.cpp    | 15 +++-
 mlir/lib/Query/Matcher/RegistryManager.h      |  1 +
 mlir/lib/Query/Query.cpp                      | 80 ++++++++++++++++++-
 mlir/test/mlir-query/function-extraction.mlir | 19 +++++
 9 files changed, 207 insertions(+), 17 deletions(-)
 create mode 100644 mlir/test/mlir-query/function-extraction.mlir

diff --git a/mlir/include/mlir/Query/Matcher/ErrorBuilder.h b/mlir/include/mlir/Query/Matcher/ErrorBuilder.h
index 1073daed8703f5..08f1f415cbd3e5 100644
--- a/mlir/include/mlir/Query/Matcher/ErrorBuilder.h
+++ b/mlir/include/mlir/Query/Matcher/ErrorBuilder.h
@@ -37,8 +37,12 @@ enum class ErrorType {
   None,
 
   // Parser Errors
+  ParserChainedExprInvalidArg,
+  ParserChainedExprNoCloseParen,
+  ParserChainedExprNoOpenParen,
   ParserFailedToBuildMatcher,
   ParserInvalidToken,
+  ParserMalformedChainedExpr,
   ParserNoCloseParen,
   ParserNoCode,
   ParserNoComma,
@@ -50,9 +54,10 @@ enum class ErrorType {
 
   // Registry Errors
   RegistryMatcherNotFound,
+  RegistryNotBindable,
   RegistryValueNotFound,
   RegistryWrongArgCount,
-  RegistryWrongArgType
+  RegistryWrongArgType,
 };
 
 void addError(Diagnostics *error, SourceRange range, ErrorType errorType,
diff --git a/mlir/include/mlir/Query/Matcher/MatchersInternal.h b/mlir/include/mlir/Query/Matcher/MatchersInternal.h
index 67455be592393b..117f7d4edef9e3 100644
--- a/mlir/include/mlir/Query/Matcher/MatchersInternal.h
+++ b/mlir/include/mlir/Query/Matcher/MatchersInternal.h
@@ -63,8 +63,15 @@ class DynMatcher {
 
   bool match(Operation *op) const { return implementation->match(op); }
 
+  void setFunctionName(StringRef name) { functionName = name.str(); };
+
+  bool hasFunctionName() const { return !functionName.empty(); };
+
+  StringRef getFunctionName() const { return functionName; };
+
 private:
   llvm::IntrusiveRefCntPtr<MatcherInterface> implementation;
+  std::string functionName;
 };
 
 } // namespace mlir::query::matcher
diff --git a/mlir/lib/Query/Matcher/Diagnostics.cpp b/mlir/lib/Query/Matcher/Diagnostics.cpp
index 10468dbcc53067..2a137e8fdfab0d 100644
--- a/mlir/lib/Query/Matcher/Diagnostics.cpp
+++ b/mlir/lib/Query/Matcher/Diagnostics.cpp
@@ -38,6 +38,8 @@ static llvm::StringRef errorTypeToFormatString(ErrorType type) {
     return "Incorrect type for arg $0. (Expected = $1) != (Actual = $2)";
   case ErrorType::RegistryValueNotFound:
     return "Value not found: $0";
+  case ErrorType::RegistryNotBindable:
+    return "Matcher does not support binding.";
 
   case ErrorType::ParserStringError:
     return "Error parsing string token: <$0>";
@@ -57,6 +59,14 @@ static llvm::StringRef errorTypeToFormatString(ErrorType type) {
     return "Unexpected end of code.";
   case ErrorType::ParserOverloadedType:
     return "Input value has unresolved overloaded type: $0";
+  case ErrorType::ParserMalformedChainedExpr:
+    return "Period not followed by valid chained call.";
+  case ErrorType::ParserChainedExprInvalidArg:
+    return "Missing/Invalid argument for the chained call.";
+  case ErrorType::ParserChainedExprNoCloseParen:
+    return "Missing ')' for the chained call.";
+  case ErrorType::ParserChainedExprNoOpenParen:
+    return "Missing '(' for the chained call.";
   case ErrorType::ParserFailedToBuildMatcher:
     return "Failed to build matcher: $0.";
 
diff --git a/mlir/lib/Query/Matcher/Parser.cpp b/mlir/lib/Query/Matcher/Parser.cpp
index 30eb4801fc03c1..3609e24f9939f7 100644
--- a/mlir/lib/Query/Matcher/Parser.cpp
+++ b/mlir/lib/Query/Matcher/Parser.cpp
@@ -26,12 +26,17 @@ struct Parser::TokenInfo {
     text = newText;
   }
 
+  // Known identifiers.
+  static const char *const ID_Extract;
+
   llvm::StringRef text;
   TokenKind kind = TokenKind::Eof;
   SourceRange range;
   VariantValue value;
 };
 
+const char *const Parser::TokenInfo::ID_Extract = "extract";
+
 class Parser::CodeTokenizer {
 public:
   // Constructor with matcherCode and error
@@ -298,6 +303,36 @@ bool Parser::parseIdentifierPrefixImpl(VariantValue *value) {
   return parseMatcherExpressionImpl(nameToken, openToken, ctor, value);
 }
 
+bool Parser::parseChainedExpression(std::string &argument) {
+  // Parse the parenthesized argument to .extract("foo")
+  // Note: EOF is handled inside the consume functions and would fail below when
+  // checking token kind.
+  const TokenInfo openToken = tokenizer->consumeNextToken();
+  const TokenInfo argumentToken = tokenizer->consumeNextTokenIgnoreNewlines();
+  const TokenInfo closeToken = tokenizer->consumeNextTokenIgnoreNewlines();
+
+  if (openToken.kind != TokenKind::OpenParen) {
+    error->addError(openToken.range, ErrorType::ParserChainedExprNoOpenParen);
+    return false;
+  }
+
+  if (argumentToken.kind != TokenKind::Literal ||
+      !argumentToken.value.isString()) {
+    error->addError(argumentToken.range,
+                    ErrorType::ParserChainedExprInvalidArg);
+    return false;
+  }
+
+  if (closeToken.kind != TokenKind::CloseParen) {
+    error->addError(closeToken.range, ErrorType::ParserChainedExprNoCloseParen);
+    return false;
+  }
+
+  // If all checks passed, extract the argument and return true.
+  argument = argumentToken.value.getString();
+  return true;
+}
+
 // Parse the arguments of a matcher
 bool Parser::parseMatcherArgs(std::vector<ParserValue> &args, MatcherCtor ctor,
                               const TokenInfo &nameToken, TokenInfo &endToken) {
@@ -364,13 +399,34 @@ bool Parser::parseMatcherExpressionImpl(const TokenInfo &nameToken,
     return false;
   }
 
+  std::string functionName;
+  if (tokenizer->peekNextToken().kind == TokenKind::Period) {
+    tokenizer->consumeNextToken();
+    TokenInfo chainCallToken = tokenizer->consumeNextToken();
+    if (chainCallToken.kind == TokenKind::CodeCompletion) {
+      addCompletion(chainCallToken, MatcherCompletion("extract(\"", "extract"));
+      return false;
+    }
+
+    if (chainCallToken.kind != TokenKind::Ident ||
+        chainCallToken.text != TokenInfo::ID_Extract) {
+      error->addError(chainCallToken.range,
+                      ErrorType::ParserMalformedChainedExpr);
+      return false;
+    }
+
+    if (chainCallToken.text == TokenInfo::ID_Extract &&
+        !parseChainedExpression(functionName))
+      return false;
+  }
+
   if (!ctor)
     return false;
   // Merge the start and end infos.
   SourceRange matcherRange = nameToken.range;
   matcherRange.end = endToken.range.end;
-  VariantMatcher result =
-      sema->actOnMatcherExpression(*ctor, matcherRange, args, error);
+  VariantMatcher result = sema->actOnMatcherExpression(
+      *ctor, matcherRange, functionName, args, error);
   if (result.isNull())
     return false;
   *value = result;
@@ -470,9 +526,10 @@ Parser::RegistrySema::lookupMatcherCtor(llvm::StringRef matcherName) {
 }
 
 VariantMatcher Parser::RegistrySema::actOnMatcherExpression(
-    MatcherCtor ctor, SourceRange nameRange, llvm::ArrayRef<ParserValue> args,
-    Diagnostics *error) {
-  return RegistryManager::constructMatcher(ctor, nameRange, args, error);
+    MatcherCtor ctor, SourceRange nameRange, llvm::StringRef functionName,
+    llvm::ArrayRef<ParserValue> args, Diagnostics *error) {
+  return RegistryManager::constructMatcher(ctor, nameRange, functionName, args,
+                                           error);
 }
 
 std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes(
diff --git a/mlir/lib/Query/Matcher/Parser.h b/mlir/lib/Query/Matcher/Parser.h
index f049af34e9c907..58968023022d56 100644
--- a/mlir/lib/Query/Matcher/Parser.h
+++ b/mlir/lib/Query/Matcher/Parser.h
@@ -64,10 +64,9 @@ class Parser {
 
     // Process a matcher expression. The caller takes ownership of the Matcher
     // object returned.
-    virtual VariantMatcher
-    actOnMatcherExpression(MatcherCtor ctor, SourceRange nameRange,
-                           llvm::ArrayRef<ParserValue> args,
-                           Diagnostics *error) = 0;
+    virtual VariantMatcher actOnMatcherExpression(
+        MatcherCtor ctor, SourceRange nameRange, llvm::StringRef functionName,
+        llvm::ArrayRef<ParserValue> args, Diagnostics *error) = 0;
 
     // Look up a matcher by name in the matcher name found by the parser.
     virtual std::optional<MatcherCtor>
@@ -93,10 +92,11 @@ class Parser {
     std::optional<MatcherCtor>
     lookupMatcherCtor(llvm::StringRef matcherName) override;
 
-    VariantMatcher actOnMatcherExpression(MatcherCtor ctor,
-                                          SourceRange nameRange,
-                                          llvm::ArrayRef<ParserValue> args,
-                                          Diagnostics *error) override;
+    VariantMatcher actOnMatcherExpression(MatcherCtor Ctor,
+                                          SourceRange NameRange,
+                                          StringRef functionName,
+                                          ArrayRef<ParserValue> Args,
+                                          Diagnostics *Error) override;
 
     std::vector<ArgKind> getAcceptedCompletionTypes(
         llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> context) override;
@@ -153,6 +153,8 @@ class Parser {
   Parser(CodeTokenizer *tokenizer, const Registry &matcherRegistry,
          const NamedValueMap *namedValues, Diagnostics *error);
 
+  bool parseChainedExpression(std::string &argument);
+
   bool parseExpressionImpl(VariantValue *value);
 
   bool parseMatcherArgs(std::vector<ParserValue> &args, MatcherCtor ctor,
diff --git a/mlir/lib/Query/Matcher/RegistryManager.cpp b/mlir/lib/Query/Matcher/RegistryManager.cpp
index 01856aa8ffa67f..8c9197f4d00981 100644
--- a/mlir/lib/Query/Matcher/RegistryManager.cpp
+++ b/mlir/lib/Query/Matcher/RegistryManager.cpp
@@ -132,8 +132,19 @@ RegistryManager::getMatcherCompletions(llvm::ArrayRef<ArgKind> acceptedTypes,
 
 VariantMatcher RegistryManager::constructMatcher(
     MatcherCtor ctor, internal::SourceRange nameRange,
-    llvm::ArrayRef<ParserValue> args, internal::Diagnostics *error) {
-  return ctor->create(nameRange, args, error);
+    llvm::StringRef functionName, llvm::ArrayRef<ParserValue> args,
+    internal::Diagnostics *error) {
+  VariantMatcher out = ctor->create(nameRange, args, error);
+  if (functionName.empty() || out.isNull())
+    return out;
+
+  if (std::optional<DynMatcher> result = out.getDynMatcher()) {
+    result->setFunctionName(functionName);
+    return VariantMatcher::SingleMatcher(*result);
+  }
+
+  error->addError(nameRange, internal::ErrorType::RegistryNotBindable);
+  return {};
 }
 
 } // namespace mlir::query::matcher
diff --git a/mlir/lib/Query/Matcher/RegistryManager.h b/mlir/lib/Query/Matcher/RegistryManager.h
index 5f2867261225e7..e2026e97f83dcb 100644
--- a/mlir/lib/Query/Matcher/RegistryManager.h
+++ b/mlir/lib/Query/Matcher/RegistryManager.h
@@ -61,6 +61,7 @@ class RegistryManager {
 
   static VariantMatcher constructMatcher(MatcherCtor ctor,
                                          internal::SourceRange nameRange,
+                                         llvm::StringRef functionName,
                                          ArrayRef<ParserValue> args,
                                          internal::Diagnostics *error);
 };
diff --git a/mlir/lib/Query/Query.cpp b/mlir/lib/Query/Query.cpp
index 5c42e5a5f0a116..27db52b37dade0 100644
--- a/mlir/lib/Query/Query.cpp
+++ b/mlir/lib/Query/Query.cpp
@@ -8,6 +8,8 @@
 
 #include "mlir/Query/Query.h"
 #include "QueryParser.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/IRMapping.h"
 #include "mlir/Query/Matcher/MatchFinder.h"
 #include "mlir/Query/QuerySession.h"
 #include "mlir/Support/LogicalResult.h"
@@ -34,6 +36,70 @@ static void printMatch(llvm::raw_ostream &os, QuerySession &qs, Operation *op,
                                      "\"" + binding + "\" binds here");
 }
 
+// TODO: Extract into a helper function that can be reused outside query
+// context.
+static Operation *extractFunction(std::vector<Operation *> &ops,
+                                  MLIRContext *context,
+                                  llvm::StringRef functionName) {
+  context->loadDialect<func::FuncDialect>();
+  OpBuilder builder(context);
+
+  // Collect data for function creation
+  std::vector<Operation *> slice;
+  std::vector<Value> values;
+  std::vector<Type> outputTypes;
+
+  for (auto *op : ops) {
+    // Return op's operands are propagated, but the op itself isn't needed.
+    if (!isa<func::ReturnOp>(op))
+      slice.push_back(op);
+
+    // All results are returned by the extracted function.
+    outputTypes.insert(outputTypes.end(), op->getResults().getTypes().begin(),
+                       op->getResults().getTypes().end());
+
+    // Track all values that need to be taken as input to function.
+    values.insert(values.end(), op->getOperands().begin(),
+                  op->getOperands().end());
+  }
+
+  // Create the function
+  FunctionType funcType =
+      builder.getFunctionType(ValueRange(values), outputTypes);
+  auto loc = builder.getUnknownLoc();
+  func::FuncOp funcOp = func::FuncOp::create(loc, functionName, funcType);
+
+  builder.setInsertionPointToEnd(funcOp.addEntryBlock());
+
+  // Map original values to function arguments
+  IRMapping mapper;
+  for (const auto &arg : llvm::enumerate(values))
+    mapper.map(arg.value(), funcOp.getArgument(arg.index()));
+
+  // Clone operations and build function body
+  std::vector<Operation *> clonedOps;
+  std::vector<Value> clonedVals;
+  for (Operation *slicedOp : slice) {
+    Operation *clonedOp =
+        clonedOps.emplace_back(builder.clone(*slicedOp, mapper));
+    clonedVals.insert(clonedVals.end(), clonedOp->result_begin(),
+                      clonedOp->result_end());
+  }
+  // Add return operation
+  builder.create<func::ReturnOp>(loc, clonedVals);
+
+  // Remove unused function arguments
+  size_t currentIndex = 0;
+  while (currentIndex < funcOp.getNumArguments()) {
+    if (funcOp.getArgument(currentIndex).use_empty())
+      funcOp.eraseArgument(currentIndex);
+    else
+      ++currentIndex;
+  }
+
+  return funcOp;
+}
+
 Query::~Query() = default;
 
 mlir::LogicalResult InvalidQuery::run(llvm::raw_ostream &os,
@@ -65,9 +131,21 @@ mlir::LogicalResult QuitQuery::run(llvm::raw_ostream &os,
 
 mlir::LogicalResult MatchQuery::run(llvm::raw_ostream &os,
                                     QuerySession &qs) const {
+  Operation *rootOp = qs.getRootOp();
   int matchCount = 0;
   std::vector<Operation *> matches =
-      matcher::MatchFinder().getMatches(qs.getRootOp(), matcher);
+      matcher::MatchFinder().getMatches(rootOp, matcher);
+
+  // An extract call is recognized by considering if the matcher has a name.
+  // TODO: Consider making the extract more explicit.
+  if (matcher.hasFunctionName()) {
+    auto functionName = matcher.getFunctionName();
+    Operation *function =
+        extractFunction(matches, rootOp->getContext(), functionName);
+    os << "\n" << *function << "\n\n";
+    return mlir::success();
+  }
+
   os << "\n";
   for (Operation *op : matches) {
     os << "Match #" << ++matchCount << ":\n\n";
diff --git a/mlir/test/mlir-query/function-extraction.mlir b/mlir/test/mlir-query/function-extraction.mlir
new file mode 100644
index 00000000000000..a783f65c6761bc
--- /dev/null
+++ b/mlir/test/mlir-query/function-extraction.mlir
@@ -0,0 +1,19 @@
+// RUN: mlir-query %s -c "m hasOpName(\"arith.mulf\").extract(\"testmul\")" | FileCheck %s
+
+// CHECK: func.func @testmul({{.*}}) -> (f32, f32, f32) {
+// CHECK:       %[[MUL0:.*]] = arith.mulf {{.*}} : f32
+// CHECK:       %[[MUL1:.*]] = arith.mulf {{.*}}, %[[MUL0]] : f32
+// CHECK:       %[[MUL2:.*]] = arith.mulf {{.*}} : f32
+// CHECK-NEXT:  return %[[MUL0]], %[[MUL1]], %[[MUL2]] : f32, f32, f32
+
+func.func @mixedOperations(%a: f32, %b: f32, %c: f32) -> f32 {
+  %sum0 = arith.addf %a, %b : f32
+  %sub0 = arith.subf %sum0, %c : f32
+  %mul0 = arith.mulf %a, %sub0 : f32
+  %sum1 = arith.addf %b, %c : f32
+  %mul1 = arith.mulf %sum1, %mul0 : f32
+  %sub2 = arith.subf %mul1, %a : f32
+  %sum2 = arith.addf %mul1, %b : f32
+  %mul2 = arith.mulf %sub2, %sum2 : f32
+  return %mul2 : f32
+}

From 58e476f757775313d8b2649dedb9a7c5d30d8e19 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Thu, 29 Feb 2024 10:48:23 -0500
Subject: [PATCH 32/55] [libc++] Map forward declaration headers for iostreams
 to <iosfwd> (#83327)

This seems more appropriate than mapping them to the headers that
contain actual definitions.
---
 libcxx/include/libcxx.imp             | 12 ++++++------
 libcxx/utils/generate_iwyu_mapping.py |  2 ++
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/libcxx/include/libcxx.imp b/libcxx/include/libcxx.imp
index 22fbea99b848bb..eeeae39ca101d9 100644
--- a/libcxx/include/libcxx.imp
+++ b/libcxx/include/libcxx.imp
@@ -425,17 +425,17 @@
   { include: [ "<__fwd/bit_reference.h>", "private", "<bitset>", "public" ] },
   { include: [ "<__fwd/bit_reference.h>", "private", "<vector>", "public" ] },
   { include: [ "<__fwd/complex.h>", "private", "<complex>", "public" ] },
-  { include: [ "<__fwd/fstream.h>", "private", "<fstream>", "public" ] },
+  { include: [ "<__fwd/fstream.h>", "private", "<iosfwd>", "public" ] },
   { include: [ "<__fwd/hash.h>", "private", "<functional>", "public" ] },
-  { include: [ "<__fwd/ios.h>", "private", "<ios>", "public" ] },
-  { include: [ "<__fwd/istream.h>", "private", "<istream>", "public" ] },
+  { include: [ "<__fwd/ios.h>", "private", "<iosfwd>", "public" ] },
+  { include: [ "<__fwd/istream.h>", "private", "<iosfwd>", "public" ] },
   { include: [ "<__fwd/mdspan.h>", "private", "<mdspan>", "public" ] },
   { include: [ "<__fwd/memory_resource.h>", "private", "<memory_resource>", "public" ] },
-  { include: [ "<__fwd/ostream.h>", "private", "<ostream>", "public" ] },
+  { include: [ "<__fwd/ostream.h>", "private", "<iosfwd>", "public" ] },
   { include: [ "<__fwd/pair.h>", "private", "<utility>", "public" ] },
   { include: [ "<__fwd/span.h>", "private", "<span>", "public" ] },
-  { include: [ "<__fwd/sstream.h>", "private", "<sstream>", "public" ] },
-  { include: [ "<__fwd/streambuf.h>", "private", "<streambuf>", "public" ] },
+  { include: [ "<__fwd/sstream.h>", "private", "<iosfwd>", "public" ] },
+  { include: [ "<__fwd/streambuf.h>", "private", "<iosfwd>", "public" ] },
   { include: [ "<__fwd/string.h>", "private", "<string>", "public" ] },
   { include: [ "<__fwd/string_view.h>", "private", "<string_view>", "public" ] },
   { include: [ "<__fwd/subrange.h>", "private", "<ranges>", "public" ] },
diff --git a/libcxx/utils/generate_iwyu_mapping.py b/libcxx/utils/generate_iwyu_mapping.py
index 0a650250e747f6..6eb2c6095bf1e7 100644
--- a/libcxx/utils/generate_iwyu_mapping.py
+++ b/libcxx/utils/generate_iwyu_mapping.py
@@ -40,6 +40,8 @@ def IWYU_mapping(header: str) -> typing.Optional[typing.List[str]]:
         return ["utility"]
     elif header == "__fwd/subrange.h":
         return ["ranges"]
+    elif re.match("__fwd/(fstream|ios|istream|ostream|sstream|streambuf)[.]h", header):
+        return ["iosfwd"]
     # Handle remaining forward declaration headers
     elif re.match("__fwd/(.+)[.]h", header):
         return [re.match("__fwd/(.+)[.]h", header).group(1)]

From 962f3e4e5bbdbfacb8c857a15013dae09bc49956 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sgundapa@quicinc.com>
Date: Thu, 29 Feb 2024 09:51:52 -0600
Subject: [PATCH 33/55] [Hexagon] Use the correct call to detect debug
 instructions (#83373)

---
 llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp    | 2 +-
 llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 3b8234c0118435..4c18e076c43936 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -1002,7 +1002,7 @@ namespace {
 bool DeadCodeElimination::isDead(unsigned R) const {
   for (const MachineOperand &MO : MRI.use_operands(R)) {
     const MachineInstr *UseI = MO.getParent();
-    if (UseI->isDebugValue())
+    if (UseI->isDebugInstr())
       continue;
     if (UseI->isPHI()) {
       assert(!UseI->getOperand(0).getSubReg());
diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index e38c8bacaf2bab..56472d633694ae 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -1180,7 +1180,7 @@ void HexagonPacketizerList::unpacketizeSoloInstrs(MachineFunction &MF) {
       bool InsertBeforeBundle;
       if (MI.isInlineAsm())
         InsertBeforeBundle = !hasWriteToReadDep(MI, *BundleIt, HRI);
-      else if (MI.isDebugValue())
+      else if (MI.isDebugInstr())
         InsertBeforeBundle = true;
       else
         continue;

From 147dc81c1d220a54e9e7d0d4dba7f4b69708ffb7 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray@arm.com>
Date: Thu, 29 Feb 2024 15:57:50 +0000
Subject: [PATCH 34/55] [ARM][AArch64] Enable FEAT_FHM for Arm Neoverse N2
 (#82613)

Correct an issue with Arm Neoverse N2 after it was
changed to a v9a core in change
f576cbe44eabb8a5ac0af817424a0d1e7c8fbf85:

  * FEAT_FHM should be enabled for this core.
---
 llvm/include/llvm/TargetParser/AArch64TargetParser.h | 4 ++--
 llvm/include/llvm/TargetParser/ARMTargetParser.def   | 4 ++--
 llvm/lib/Target/AArch64/AArch64.td                   | 2 +-
 llvm/lib/Target/ARM/ARM.td                           | 1 +
 llvm/unittests/TargetParser/TargetParserTest.cpp     | 7 ++++---
 5 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 93e9ed46642dfc..404424beb01931 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -658,8 +658,8 @@ inline constexpr CpuInfo CpuInfos[] = {
           AArch64::AEK_SSBS}))},
     {"neoverse-n2", ARMV9A,
      (AArch64::ExtensionBitset(
-         {AArch64::AEK_BF16, AArch64::AEK_DOTPROD,
-          AArch64::AEK_FP16, AArch64::AEK_I8MM, AArch64::AEK_MTE,
+         {AArch64::AEK_BF16, AArch64::AEK_DOTPROD, AArch64::AEK_FP16,
+          AArch64::AEK_FP16FML, AArch64::AEK_I8MM, AArch64::AEK_MTE,
           AArch64::AEK_SB, AArch64::AEK_SSBS, AArch64::AEK_SVE,
           AArch64::AEK_SVE2, AArch64::AEK_SVE2BITPERM}))},
     {"neoverse-512tvb", ARMV8_4A,
diff --git a/llvm/include/llvm/TargetParser/ARMTargetParser.def b/llvm/include/llvm/TargetParser/ARMTargetParser.def
index 1797a1b238d349..f0ddaa1459e567 100644
--- a/llvm/include/llvm/TargetParser/ARMTargetParser.def
+++ b/llvm/include/llvm/TargetParser/ARMTargetParser.def
@@ -346,8 +346,8 @@ ARM_CPU_NAME("cortex-x1c", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
 ARM_CPU_NAME("neoverse-n1", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
              (ARM::AEK_FP16 | ARM::AEK_DOTPROD))
 ARM_CPU_NAME("neoverse-n2", ARMV9A, FK_NEON_FP_ARMV8, false,
-             (ARM::AEK_BF16 | ARM::AEK_DOTPROD | ARM::AEK_I8MM | ARM::AEK_RAS |
-              ARM::AEK_SB))
+             (ARM::AEK_BF16 | ARM::AEK_DOTPROD | ARM::AEK_FP16FML |
+              ARM::AEK_I8MM | ARM::AEK_RAS | ARM::AEK_SB ))
 ARM_CPU_NAME("neoverse-v1", ARMV8_4A, FK_CRYPTO_NEON_FP_ARMV8, false,
              (ARM::AEK_RAS | ARM::AEK_FP16 | ARM::AEK_BF16 | ARM::AEK_DOTPROD))
 ARM_CPU_NAME("cyclone", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC)
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 169b00e5ebc989..b837066554f3c6 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -1514,7 +1514,7 @@ def ProcessorFeatures {
                                        FeatureFPARMv8, FeatureFullFP16, FeatureNEON,
                                        FeatureRCPC, FeatureSPE, FeatureSSBS,
                                        FeaturePerfMon];
-  list<SubtargetFeature> NeoverseN2 = [HasV9_0aOps, FeatureBF16, FeatureETE,
+  list<SubtargetFeature> NeoverseN2 = [HasV9_0aOps, FeatureBF16, FeatureETE, FeatureFP16FML,
                                        FeatureMatMulInt8, FeatureMTE, FeatureSVE2,
                                        FeatureSVE2BitPerm, FeatureTRBE,
                                        FeaturePerfMon];
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index 877781568307dc..b62e1a032631fd 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -1682,6 +1682,7 @@ def : ProcNoItin<"neoverse-n1",                         [ARMv82a,
 
 def : ProcNoItin<"neoverse-n2",                         [ARMv9a,
                                                          FeatureBF16,
+                                                         FeatureFP16FML,
                                                          FeatureMatMulInt8]>;
 
 def : ProcessorModel<"cyclone",     SwiftModel,         [ARMv8a, ProcSwift,
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index e89fc687451cd7..297100441113a6 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -439,7 +439,7 @@ INSTANTIATE_TEST_SUITE_P(
                              ARM::AEK_HWDIVARM | ARM::AEK_MP | ARM::AEK_SEC |
                              ARM::AEK_VIRT | ARM::AEK_DSP | ARM::AEK_BF16 |
                              ARM::AEK_DOTPROD | ARM::AEK_RAS | ARM::AEK_I8MM |
-                             ARM::AEK_SB,
+                             ARM::AEK_FP16FML | ARM::AEK_SB,
             "9-A"),
         ARMCPUTestParams<uint64_t>("neoverse-v1", "armv8.4-a", "crypto-neon-fp-armv8",
             ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT |
@@ -1575,8 +1575,9 @@ INSTANTIATE_TEST_SUITE_P(
                  AArch64::AEK_SB,          AArch64::AEK_SVE2,
                  AArch64::AEK_SVE2BITPERM, AArch64::AEK_BF16,
                  AArch64::AEK_I8MM,        AArch64::AEK_JSCVT,
-                 AArch64::AEK_FCMA,        AArch64::AEK_PAUTH})),
-            "8.5-A"),
+                 AArch64::AEK_FCMA,        AArch64::AEK_PAUTH,
+                 AArch64::AEK_FP16FML})),
+            "9-A"),
         ARMCPUTestParams<AArch64::ExtensionBitset>(
             "ampere1", "armv8.6-a", "crypto-neon-fp-armv8",
             (AArch64::ExtensionBitset(

From 310ed337092d6afc78594e1d8e32e6f44fff5568 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Thu, 29 Feb 2024 08:05:38 -0800
Subject: [PATCH 35/55] [Support] Use all_read | all_write for
 createTemporaryFile (#83360)

In a04879ce7dd6, dsymutil switched from using TempFile to
createTemporaryFile. This caused a regression because the two use
different permissions:

 - TempFile opens the file as all_read | all_write
 - createTemporaryFile opens the file as owner_read | owner_write

The latter turns out to be problematic for dsymutil because it either
promotes the temporary to a proper output file, or it would pass it to
`lipo` to create a universal binary and `lipo` preserves the permissions
of the input files. Either way, this caused issues when the build system
was run as a different user than the one ingesting the resulting
binaries.

I did some version control archeology and I couldn't find evidence that
these permissions were chosen purposely. Both could be considered
reasonable default.

This patch changes the permissions to `all read | all write` to make the
two consistent and match the one currently used by the higher level
abstraction (TempFile).

rdar://123722848
---
 llvm/lib/Support/Path.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp
index c8de2c0625aa26..acee228a0d0462 100644
--- a/llvm/lib/Support/Path.cpp
+++ b/llvm/lib/Support/Path.cpp
@@ -850,7 +850,7 @@ createTemporaryFile(const Twine &Model, int &ResultFD,
          "Model must be a simple filename.");
   // Use P.begin() so that createUniqueEntity doesn't need to recreate Storage.
   return createUniqueEntity(P.begin(), ResultFD, ResultPath, true, Type, Flags,
-                            owner_read | owner_write);
+                            all_read | all_write);
 }
 
 static std::error_code

From 5a0bd2a36591fe48c24de220a5f8ddef9ce6f0b1 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Wed, 28 Feb 2024 21:19:02 -0800
Subject: [PATCH 36/55] [lldb] Add pexpect to
 LLDB_ENFORCE_STRICT_TEST_REQUIREMENTS

This executed Alex' idea [1] of adding pexpect to the list of "strict
test requirements" as we're planning to stop vendoring it. This will
ensure all the bots have the package before we toggle the default.

[1] https://github.com/llvm/llvm-project/pull/83191
---
 lldb/test/CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt
index 7c31fd487f6ff8..2a9877c721e3b4 100644
--- a/lldb/test/CMakeLists.txt
+++ b/lldb/test/CMakeLists.txt
@@ -12,7 +12,8 @@ endif()
 if(LLDB_ENFORCE_STRICT_TEST_REQUIREMENTS)
   message(STATUS "Enforcing strict test requirements for LLDB")
   set(useful_python_modules
-    psutil # Lit uses psutil to do per-test timeouts.
+    psutil  # Lit uses psutil to do per-test timeouts.
+    pexpect # We no longer vendor pexpect.
   )
   foreach(module ${useful_python_modules})
     lldb_find_python_module(${module})

From a344db793aca6881379c7c83f5112d2870dbf958 Mon Sep 17 00:00:00 2001
From: chuongg3 <chuong.goh@arm.com>
Date: Thu, 29 Feb 2024 16:31:05 +0000
Subject: [PATCH 37/55] [AArch64][GlobalISel] Legalize G_SHUFFLE_VECTOR for
 Odd-Sized Vectors (#83038)

Legalize Smaller/Larger than legal vectors with i8 and i16 element
sizes.
Vectors with elements smaller than i8 will get widened to i8 elements.
---
 .../llvm/CodeGen/GlobalISel/LegalizerInfo.h   |  12 +
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |   1 +
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   3 +
 .../AArch64/GlobalISel/legalize-select.mir    |  72 +++---
 llvm/test/CodeGen/AArch64/shufflevector.ll    | 232 ++++++++++++------
 5 files changed, 212 insertions(+), 108 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index 637c2c71b02411..6afaea3f3fc5c6 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -908,6 +908,18 @@ class LegalizeRuleSet {
         LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx, MinSize));
   }
 
+  /// Widen the scalar or vector element type to the next power of two that is
+  /// at least MinSize.  No effect if the scalar size is a power of two.
+  LegalizeRuleSet &widenScalarOrEltToNextPow2OrMinSize(unsigned TypeIdx,
+                                                       unsigned MinSize = 0) {
+    using namespace LegalityPredicates;
+    return actionIf(
+        LegalizeAction::WidenScalar,
+        any(scalarOrEltNarrowerThan(TypeIdx, MinSize),
+            scalarOrEltSizeNotPow2(typeIdx(TypeIdx))),
+        LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx, MinSize));
+  }
+
   LegalizeRuleSet &narrowScalar(unsigned TypeIdx, LegalizeMutation Mutation) {
     using namespace LegalityPredicates;
     return actionIf(LegalizeAction::NarrowScalar, isScalar(typeIdx(TypeIdx)),
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 8079f853aef855..1d016e684c48f6 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2495,6 +2495,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
   case TargetOpcode::G_OR:
   case TargetOpcode::G_XOR:
   case TargetOpcode::G_SUB:
+  case TargetOpcode::G_SHUFFLE_VECTOR:
     // Perform operation at larger width (any extension is fines here, high bits
     // don't affect the result) and then truncate the result back to the
     // original type.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 91323e456a5ef8..84b4ecb7d2700a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -956,6 +956,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
           },
           changeTo(1, 0))
       .moreElementsToNextPow2(0)
+      .widenScalarOrEltToNextPow2OrMinSize(0, 8)
+      .clampNumElements(0, v8s8, v16s8)
+      .clampNumElements(0, v4s16, v8s16)
       .clampNumElements(0, v4s32, v4s32)
       .clampNumElements(0, v2s64, v2s64)
       .moreElementsIf(
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir
index 4879ffd28784c1..63a26dcfea4762 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir
@@ -287,39 +287,47 @@ body:             |
     ; CHECK-NEXT: %q0:_(<4 x s32>) = COPY $q0
     ; CHECK-NEXT: %q1:_(<4 x s32>) = COPY $q1
     ; CHECK-NEXT: %q2:_(<4 x s32>) = COPY $q2
-    ; CHECK-NEXT: %vec_cond0:_(<4 x s1>) = G_ICMP intpred(eq), %q0(<4 x s32>), %q1
-    ; CHECK-NEXT: %vec_cond1:_(<4 x s1>) = G_ICMP intpred(eq), %q0(<4 x s32>), %q2
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(eq), %q0(<4 x s32>), %q1
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(eq), %q0(<4 x s32>), %q2
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4100
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32)
-    ; CHECK-NEXT: %cmp:_(s1) = G_ICMP intpred(eq), %w0(s32), [[C]]
-    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT %cmp(s1)
-    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ZEXT]], 1
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[SEXT_INREG]](s32)
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s1>) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
-    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C2]](s64)
-    ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s1>) = G_SHUFFLE_VECTOR [[IVEC]](<4 x s1>), [[DEF]], shufflemask(0, 0, 0, 0)
-    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[C3]](s8)
-    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s1>) = G_BUILD_VECTOR [[TRUNC1]](s1), [[TRUNC1]](s1), [[TRUNC1]](s1), [[TRUNC1]](s1)
-    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[SHUF]](<4 x s1>)
-    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[BUILD_VECTOR1]](<4 x s1>)
-    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[ANYEXT]], [[ANYEXT1]]
-    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(<4 x s1>) = G_TRUNC [[XOR]](<4 x s16>)
-    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT %vec_cond0(<4 x s1>)
-    ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[SHUF]](<4 x s1>)
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[ANYEXT2]], [[ANYEXT3]]
-    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(<4 x s1>) = G_TRUNC [[AND]](<4 x s16>)
-    ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT %vec_cond1(<4 x s1>)
-    ; CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[TRUNC2]](<4 x s1>)
-    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s16>) = G_AND [[ANYEXT4]], [[ANYEXT5]]
-    ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(<4 x s1>) = G_TRUNC [[AND1]](<4 x s16>)
-    ; CHECK-NEXT: [[ANYEXT6:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[TRUNC3]](<4 x s1>)
-    ; CHECK-NEXT: [[ANYEXT7:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[TRUNC4]](<4 x s1>)
-    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR [[ANYEXT6]], [[ANYEXT7]]
-    ; CHECK-NEXT: %select:_(<4 x s1>) = G_TRUNC [[OR]](<4 x s16>)
-    ; CHECK-NEXT: %zext_select:_(<4 x s32>) = G_ZEXT %select(<4 x s1>)
+    ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %w0(s32), [[C]]
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ICMP2]], 1
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY [[DEF1]](s16)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY [[DEF1]](s16)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[DEF1]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[COPY]](s16), [[COPY1]](s16), [[COPY2]](s16), [[DEF1]](s16)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s16>) = G_INSERT_VECTOR_ELT [[BUILD_VECTOR]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[IVEC]](<4 x s16>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[UV]](s16)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[UV1]](s16)
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16)
+    ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV3]](s16)
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC1]](s8), [[TRUNC2]](s8), [[TRUNC3]](s8), [[TRUNC4]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8)
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8)
+    ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<8 x s8>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR1]](<8 x s8>), [[BUILD_VECTOR2]], shufflemask(0, 0, 0, 0, undef, undef, undef, undef)
+    ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<4 x s8>), [[UV5:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[SHUF]](<8 x s8>)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s16) = COPY [[C2]](s16)
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s16) = COPY [[C2]](s16)
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s16) = COPY [[C2]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[COPY3]](s16), [[COPY4]](s16), [[COPY5]](s16), [[C2]](s16)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[UV4]](<4 x s8>)
+    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[ANYEXT]], [[BUILD_VECTOR3]]
+    ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[UV4]](<4 x s8>)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC5]], [[ANYEXT1]]
+    ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>)
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC6]], [[XOR]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR [[AND]], [[AND1]]
+    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(<4 x s32>) = G_ANYEXT [[OR]](<4 x s16>)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C3]](s32), [[C3]](s32), [[C3]](s32), [[C3]](s32)
+    ; CHECK-NEXT: %zext_select:_(<4 x s32>) = G_AND [[ANYEXT2]], [[BUILD_VECTOR4]]
     ; CHECK-NEXT: $q0 = COPY %zext_select(<4 x s32>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %w0:_(s32) = COPY $w0
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index df59eb8e629f44..5638347ee63340 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -3,15 +3,7 @@
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 ; CHECK-GI:         warning: Instruction selection used fallback path for shufflevector_v2i1
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v4i8
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v32i8
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v2i16
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v16i16
 ; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v2i1_zeroes
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v4i8_zeroes
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v32i8_zeroes
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v2i16_zeroes
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v16i16_zeroes
 ; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v3i8
 ; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shufflevector_v3i8_zeroes
 
@@ -205,68 +197,142 @@ define <2 x i1> @shufflevector_v2i1(<2 x i1> %a, <2 x i1> %b){
 }
 
 define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
-; CHECK-LABEL: shufflevector_v4i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ext v0.8b, v1.8b, v0.8b, #6
-; CHECK-NEXT:    zip1 v1.4h, v1.4h, v0.4h
-; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
-; CHECK-NEXT:    xtn v0.8b, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shufflevector_v4i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    ext v0.8b, v1.8b, v0.8b, #6
+; CHECK-SD-NEXT:    zip1 v1.4h, v1.4h, v0.4h
+; CHECK-SD-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v4i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov h3, v1.h[1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI15_0
+; CHECK-GI-NEXT:    mov h4, v0.h[2]
+; CHECK-GI-NEXT:    mov h5, v0.h[3]
+; CHECK-GI-NEXT:    mov h6, v1.h[3]
+; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov h2, v1.h[2]
+; CHECK-GI-NEXT:    mov v1.b[1], v3.b[0]
+; CHECK-GI-NEXT:    mov v0.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
+; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
+; CHECK-GI-NEXT:    mov v0.b[4], v0.b[0]
+; CHECK-GI-NEXT:    mov v1.b[4], v0.b[0]
+; CHECK-GI-NEXT:    mov v0.b[5], v0.b[0]
+; CHECK-GI-NEXT:    mov v1.b[5], v0.b[0]
+; CHECK-GI-NEXT:    mov v0.b[6], v0.b[0]
+; CHECK-GI-NEXT:    mov v1.b[6], v0.b[0]
+; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
+; CHECK-GI-NEXT:    mov v1.b[7], v0.b[0]
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI15_0]
+; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
     %c = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 7>
     %d = bitcast <4 x i8> %c to i32
     ret i32 %d
 }
 
 define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b){
-; CHECK-LABEL: shufflevector_v32i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q2 killed $q2 def $q1_q2
-; CHECK-NEXT:    adrp x8, .LCPI16_0
-; CHECK-NEXT:    adrp x9, .LCPI16_1
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI16_0]
-; CHECK-NEXT:    ldr q4, [x9, :lo12:.LCPI16_1]
-; CHECK-NEXT:    tbl v0.16b, { v1.16b, v2.16b }, v3.16b
-; CHECK-NEXT:    tbl v1.16b, { v1.16b, v2.16b }, v4.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shufflevector_v32i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 def $q1_q2
+; CHECK-SD-NEXT:    adrp x8, .LCPI16_0
+; CHECK-SD-NEXT:    adrp x9, .LCPI16_1
+; CHECK-SD-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NEXT:    ldr q3, [x8, :lo12:.LCPI16_0]
+; CHECK-SD-NEXT:    ldr q4, [x9, :lo12:.LCPI16_1]
+; CHECK-SD-NEXT:    tbl v0.16b, { v1.16b, v2.16b }, v3.16b
+; CHECK-SD-NEXT:    tbl v1.16b, { v1.16b, v2.16b }, v4.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v32i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v3.16b, v0.16b
+; CHECK-GI-NEXT:    adrp x8, .LCPI16_1
+; CHECK-GI-NEXT:    adrp x9, .LCPI16_0
+; CHECK-GI-NEXT:    mov v4.16b, v2.16b
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI16_1]
+; CHECK-GI-NEXT:    ldr q1, [x9, :lo12:.LCPI16_0]
+; CHECK-GI-NEXT:    tbl v0.16b, { v3.16b, v4.16b }, v0.16b
+; CHECK-GI-NEXT:    tbl v1.16b, { v3.16b, v4.16b }, v1.16b
+; CHECK-GI-NEXT:    ret
     %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 1, i32 32, i32 32, i32 32, i32 2, i32 32, i32 32, i32 32, i32 3, i32 32, i32 32, i32 32, i32 4, i32 32, i32 32, i32 32, i32 5, i32 32, i32 32, i32 32, i32 6, i32 32, i32 32, i32 32, i32 7, i32 32, i32 32, i32 32>
     ret <32 x i8> %c
 }
 
 define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
-; CHECK-LABEL: shufflevector_v2i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    ldr w0, [sp, #12]
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shufflevector_v2i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
+; CHECK-SD-NEXT:    mov w8, v0.s[1]
+; CHECK-SD-NEXT:    fmov w9, s0
+; CHECK-SD-NEXT:    strh w9, [sp, #12]
+; CHECK-SD-NEXT:    strh w8, [sp, #14]
+; CHECK-SD-NEXT:    ldr w0, [sp, #12]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v2i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI17_0
+; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI17_0]
+; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
     %c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> <i32 1, i32 2>
     %d = bitcast <2 x i16> %c to i32
     ret i32 %d
 }
 
 define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b){
-; CHECK-LABEL: shufflevector_v16i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q2 killed $q2 def $q1_q2
-; CHECK-NEXT:    adrp x8, .LCPI18_0
-; CHECK-NEXT:    adrp x9, .LCPI18_1
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI18_0]
-; CHECK-NEXT:    ldr q4, [x9, :lo12:.LCPI18_1]
-; CHECK-NEXT:    tbl v0.16b, { v1.16b, v2.16b }, v3.16b
-; CHECK-NEXT:    tbl v1.16b, { v1.16b, v2.16b }, v4.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shufflevector_v16i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 def $q1_q2
+; CHECK-SD-NEXT:    adrp x8, .LCPI18_0
+; CHECK-SD-NEXT:    adrp x9, .LCPI18_1
+; CHECK-SD-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NEXT:    ldr q3, [x8, :lo12:.LCPI18_0]
+; CHECK-SD-NEXT:    ldr q4, [x9, :lo12:.LCPI18_1]
+; CHECK-SD-NEXT:    tbl v0.16b, { v1.16b, v2.16b }, v3.16b
+; CHECK-SD-NEXT:    tbl v1.16b, { v1.16b, v2.16b }, v4.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v16i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v3.16b, v0.16b
+; CHECK-GI-NEXT:    adrp x8, .LCPI18_1
+; CHECK-GI-NEXT:    adrp x9, .LCPI18_0
+; CHECK-GI-NEXT:    mov v4.16b, v2.16b
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI18_1]
+; CHECK-GI-NEXT:    ldr q1, [x9, :lo12:.LCPI18_0]
+; CHECK-GI-NEXT:    tbl v0.16b, { v3.16b, v4.16b }, v0.16b
+; CHECK-GI-NEXT:    tbl v1.16b, { v3.16b, v4.16b }, v1.16b
+; CHECK-GI-NEXT:    ret
     %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16>
     ret <16 x i16> %c
 }
@@ -332,16 +398,23 @@ define <2 x i1> @shufflevector_v2i1_zeroes(<2 x i1> %a, <2 x i1> %b){
 }
 
 define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){
-; CHECK-LABEL: shufflevector_v4i8_zeroes:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    dup v0.4h, v0.h[0]
-; CHECK-NEXT:    xtn v0.8b, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shufflevector_v4i8_zeroes:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    dup v0.4h, v0.h[0]
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v4i8_zeroes:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    dup v0.8b, w8
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
     %c = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
     %d = bitcast <4 x i8> %c to i32
     ret i32 %d
@@ -358,19 +431,26 @@ define <32 x i8> @shufflevector_v32i8_zeroes(<32 x i8> %a, <32 x i8> %b){
 }
 
 define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){
-; CHECK-LABEL: shufflevector_v2i16_zeroes:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    dup v1.2s, v0.s[0]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    mov w8, v1.s[1]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    ldr w0, [sp, #12]
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shufflevector_v2i16_zeroes:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    dup v1.2s, v0.s[0]
+; CHECK-SD-NEXT:    fmov w9, s0
+; CHECK-SD-NEXT:    strh w9, [sp, #12]
+; CHECK-SD-NEXT:    mov w8, v1.s[1]
+; CHECK-SD-NEXT:    strh w8, [sp, #14]
+; CHECK-SD-NEXT:    ldr w0, [sp, #12]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shufflevector_v2i16_zeroes:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    dup v0.4h, w8
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
     %c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> <i32 0, i32 0>
     %d = bitcast <2 x i16> %c to i32
     ret i32 %d

From 9491aecd235da9674150fe56bd0f29c3a22472c4 Mon Sep 17 00:00:00 2001
From: Francesco Petrogalli <francesco.petrogalli@apple.com>
Date: Thu, 29 Feb 2024 17:35:34 +0100
Subject: [PATCH 38/55] [TableGen][CodeGenTarget] Add support for v3i8 and v3i1
 MVTs. [NFCI] (#83140)

---
 llvm/utils/TableGen/CodeGenTarget.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp
index f26815c2f184fa..980c9bdb6367f7 100644
--- a/llvm/utils/TableGen/CodeGenTarget.cpp
+++ b/llvm/utils/TableGen/CodeGenTarget.cpp
@@ -91,6 +91,7 @@ StringRef llvm::getEnumName(MVT::SimpleValueType T) {
   case MVT::isVoid:   return "MVT::isVoid";
   case MVT::v1i1:     return "MVT::v1i1";
   case MVT::v2i1:     return "MVT::v2i1";
+  case MVT::v3i1:     return "MVT::v3i1";
   case MVT::v4i1:     return "MVT::v4i1";
   case MVT::v8i1:     return "MVT::v8i1";
   case MVT::v16i1:    return "MVT::v16i1";
@@ -107,6 +108,7 @@ StringRef llvm::getEnumName(MVT::SimpleValueType T) {
   case MVT::v128i4:   return "MVT::v128i4";
   case MVT::v1i8:     return "MVT::v1i8";
   case MVT::v2i8:     return "MVT::v2i8";
+  case MVT::v3i8:     return "MVT::v3i8";
   case MVT::v4i8:     return "MVT::v4i8";
   case MVT::v8i8:     return "MVT::v8i8";
   case MVT::v16i8:    return "MVT::v16i8";

From 3fda50d3915b2163a54a37b602be7783a89dd808 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Thu, 29 Feb 2024 15:27:32 +0000
Subject: [PATCH 39/55] [NFC][RemoveDIs] Bulk update utilities to insert with
 iterators

As part of the RemoveDIs project we need LLVM to insert instructions using
iterators wherever possible, so that the iterators can carry a bit of
debug-info. This commit implements some of that by updating the contents of
llvm/lib/Transforms/Utils to always use iterator-versions of instruction
constructors.

There are two general flavours of update:
 * Almost all call-sites just call getIterator on an instruction
 * Several make use of an existing iterator (scenarios where the code is
   actually significant for debug-info)
The underlying logic is that any call to getFirstInsertionPt or similar
APIs that identify the start of a block need to have that iterator passed
directly to the insertion function, without being converted to a bare
Instruction pointer along the way.

I've also switched DemotePHIToStack to take an optional iterator: it needs
to take an iterator, and having a no-insert-location behaviour appears to
be important. The constructors for ICmpInst and FCmpInst have been updated
too. They're the only instructions that take block _references_ rather than
pointers for certain calls, and a future patch is going to make use of
default-null block insertion locations.

All of this should be NFC.
---
 llvm/include/llvm/IR/Instructions.h           |  8 ++---
 llvm/include/llvm/Transforms/Utils/Local.h    |  4 +--
 llvm/lib/IR/Instructions.cpp                  |  4 +--
 llvm/lib/Transforms/CFGuard/CFGuard.cpp       |  2 +-
 llvm/lib/Transforms/Scalar/Reg2Mem.cpp        |  6 ++--
 llvm/lib/Transforms/Utils/BasicBlockUtils.cpp |  8 ++---
 .../Transforms/Utils/BreakCriticalEdges.cpp   |  5 +--
 .../Transforms/Utils/CallPromotionUtils.cpp   |  8 ++---
 .../Utils/CanonicalizeFreezeInLoops.cpp       |  2 +-
 llvm/lib/Transforms/Utils/CodeExtractor.cpp   | 31 +++++++++----------
 .../lib/Transforms/Utils/DemoteRegToStack.cpp | 26 ++++++++--------
 .../Utils/EntryExitInstrumenter.cpp           |  6 ++--
 llvm/lib/Transforms/Utils/InlineFunction.cpp  | 18 +++++------
 llvm/lib/Transforms/Utils/Local.cpp           | 18 +++++------
 llvm/lib/Transforms/Utils/LoopConstrainer.cpp |  4 +--
 .../Transforms/Utils/LoopRotationUtils.cpp    |  2 +-
 llvm/lib/Transforms/Utils/LoopSimplify.cpp    |  2 +-
 llvm/lib/Transforms/Utils/LoopUnroll.cpp      |  2 +-
 .../lib/Transforms/Utils/LoopUnrollAndJam.cpp |  6 ++--
 .../lib/Transforms/Utils/LowerGlobalDtors.cpp |  2 +-
 llvm/lib/Transforms/Utils/LowerInvoke.cpp     |  4 +--
 .../Transforms/Utils/LowerMemIntrinsics.cpp   | 10 +++---
 llvm/lib/Transforms/Utils/LowerSwitch.cpp     | 10 +++---
 llvm/lib/Transforms/Utils/MatrixUtils.cpp     |  2 +-
 .../Transforms/Utils/MemoryTaggingSupport.cpp |  4 +--
 llvm/lib/Transforms/Utils/SCCPSolver.cpp      |  6 ++--
 .../Utils/ScalarEvolutionExpander.cpp         |  4 +--
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 16 +++++-----
 llvm/lib/Transforms/Utils/SimplifyIndVar.cpp  | 14 ++++-----
 .../lib/Transforms/Utils/StripGCRelocates.cpp |  2 +-
 llvm/lib/Transforms/Utils/UnifyLoopExits.cpp  |  2 +-
 llvm/tools/bugpoint/Miscompilation.cpp        |  2 +-
 32 files changed, 120 insertions(+), 120 deletions(-)

diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index bc357074e5cb21..4e4cf71a349d74 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -1298,14 +1298,14 @@ class ICmpInst: public CmpInst {
 
   /// Constructor with insert-at-end semantics.
   ICmpInst(
-    BasicBlock &InsertAtEnd, ///< Block to insert into.
+    BasicBlock *InsertAtEnd, ///< Block to insert into.
     Predicate pred,  ///< The predicate to use for the comparison
     Value *LHS,      ///< The left-hand-side of the expression
     Value *RHS,      ///< The right-hand-side of the expression
     const Twine &NameStr = ""  ///< Name of the instruction
   ) : CmpInst(makeCmpResultType(LHS->getType()),
               Instruction::ICmp, pred, LHS, RHS, NameStr,
-              &InsertAtEnd) {
+              InsertAtEnd) {
 #ifndef NDEBUG
   AssertOK();
 #endif
@@ -1481,14 +1481,14 @@ class FCmpInst: public CmpInst {
 
   /// Constructor with insert-at-end semantics.
   FCmpInst(
-    BasicBlock &InsertAtEnd, ///< Block to insert into.
+    BasicBlock *InsertAtEnd, ///< Block to insert into.
     Predicate pred,  ///< The predicate to use for the comparison
     Value *LHS,      ///< The left-hand-side of the expression
     Value *RHS,      ///< The right-hand-side of the expression
     const Twine &NameStr = ""  ///< Name of the instruction
   ) : CmpInst(makeCmpResultType(LHS->getType()),
               Instruction::FCmp, pred, LHS, RHS, NameStr,
-              &InsertAtEnd) {
+              InsertAtEnd) {
     AssertOK();
   }
 
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index 2df3c9049c7d62..8dc843d2eaf604 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -206,12 +206,12 @@ bool FoldBranchToCommonDest(BranchInst *BI, llvm::DomTreeUpdater *DTU = nullptr,
 /// to create a stack slot for X.
 AllocaInst *DemoteRegToStack(Instruction &X,
                              bool VolatileLoads = false,
-                             Instruction *AllocaPoint = nullptr);
+                             std::optional<BasicBlock::iterator> AllocaPoint = std::nullopt);
 
 /// This function takes a virtual register computed by a phi node and replaces
 /// it with a slot in the stack frame, allocated via alloca. The phi node is
 /// deleted and it returns the pointer to the alloca inserted.
-AllocaInst *DemotePHIToStack(PHINode *P, Instruction *AllocaPoint = nullptr);
+AllocaInst *DemotePHIToStack(PHINode *P, std::optional<BasicBlock::iterator> AllocaPoint = std::nullopt);
 
 /// If the specified pointer points to an object that we control, try to modify
 /// the object's alignment to PrefAlign. Returns a minimum known alignment of
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 42cdcad78228f6..c55d6cff84ed55 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -4608,10 +4608,10 @@ CmpInst *
 CmpInst::Create(OtherOps Op, Predicate predicate, Value *S1, Value *S2,
                 const Twine &Name, BasicBlock *InsertAtEnd) {
   if (Op == Instruction::ICmp) {
-    return new ICmpInst(*InsertAtEnd, CmpInst::Predicate(predicate),
+    return new ICmpInst(InsertAtEnd, CmpInst::Predicate(predicate),
                         S1, S2, Name);
   }
-  return new FCmpInst(*InsertAtEnd, CmpInst::Predicate(predicate),
+  return new FCmpInst(InsertAtEnd, CmpInst::Predicate(predicate),
                       S1, S2, Name);
 }
 
diff --git a/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
index 4d4306576017be..32326b5801031f 100644
--- a/llvm/lib/Transforms/CFGuard/CFGuard.cpp
+++ b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
@@ -219,7 +219,7 @@ void CFGuardImpl::insertCFGuardDispatch(CallBase *CB) {
   // Create a copy of the call/invoke instruction and add the new bundle.
   assert((isa<CallInst>(CB) || isa<InvokeInst>(CB)) &&
          "Unknown indirect call type");
-  CallBase *NewCB = CallBase::Create(CB, Bundles, CB);
+  CallBase *NewCB = CallBase::Create(CB, Bundles, CB->getIterator());
 
   // Change the target of the call to be the guard dispatch function.
   NewCB->setCalledOperand(GuardDispatchLoad);
diff --git a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
index 6c2b3e9bd4a721..ebc5075aa36fe8 100644
--- a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -64,7 +64,7 @@ static bool runPass(Function &F) {
 
   CastInst *AllocaInsertionPoint = new BitCastInst(
       Constant::getNullValue(Type::getInt32Ty(F.getContext())),
-      Type::getInt32Ty(F.getContext()), "reg2mem alloca point", &*I);
+      Type::getInt32Ty(F.getContext()), "reg2mem alloca point", I);
 
   // Find the escaped instructions. But don't create stack slots for
   // allocas in entry block.
@@ -76,7 +76,7 @@ static bool runPass(Function &F) {
   // Demote escaped instructions
   NumRegsDemoted += WorkList.size();
   for (Instruction *I : WorkList)
-    DemoteRegToStack(*I, false, AllocaInsertionPoint);
+    DemoteRegToStack(*I, false, AllocaInsertionPoint->getIterator());
 
   WorkList.clear();
 
@@ -88,7 +88,7 @@ static bool runPass(Function &F) {
   // Demote phi nodes
   NumPhisDemoted += WorkList.size();
   for (Instruction *I : WorkList)
-    DemotePHIToStack(cast<PHINode>(I), AllocaInsertionPoint);
+    DemotePHIToStack(cast<PHINode>(I), AllocaInsertionPoint->getIterator());
 
   return true;
 }
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 5bb109a04ff178..5aa59acfa6df99 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -1297,7 +1297,7 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
     // PHI.
     // Create the new PHI node, insert it into NewBB at the end of the block
     PHINode *NewPHI =
-        PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI);
+        PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI->getIterator());
 
     // NOTE! This loop walks backwards for a reason! First off, this minimizes
     // the cost of removal if we end up removing a large number of values, and
@@ -1517,7 +1517,7 @@ static void SplitLandingPadPredecessorsImpl(
       assert(!LPad->getType()->isTokenTy() &&
              "Split cannot be applied if LPad is token type. Otherwise an "
              "invalid PHINode of token type would be created.");
-      PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad);
+      PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad->getIterator());
       PN->addIncoming(Clone1, NewBB1);
       PN->addIncoming(Clone2, NewBB2);
       LPad->replaceAllUsesWith(PN);
@@ -1904,7 +1904,7 @@ static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock,
     auto Phi = cast<PHINode>(I);
     auto NewPhi =
         PHINode::Create(Phi->getType(), Incoming.size(),
-                        Phi->getName() + ".moved", &FirstGuardBlock->front());
+                        Phi->getName() + ".moved", FirstGuardBlock->begin());
     for (auto *In : Incoming) {
       Value *V = UndefValue::get(Phi->getType());
       if (In == Out) {
@@ -2023,7 +2023,7 @@ static void calcPredicateUsingInteger(
       Value *Id1 = ConstantInt::get(Type::getInt32Ty(Context),
                                     std::distance(Outgoing.begin(), Succ1Iter));
       IncomingId = SelectInst::Create(Condition, Id0, Id1, "target.bb.idx",
-                                      In->getTerminator());
+                                      In->getTerminator()->getIterator());
     } else {
       // Get the index of the non-null successor.
       auto SuccIter = Succ0 ? find(Outgoing, Succ0) : find(Outgoing, Succ1);
diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 4a784c11c611d9..4606514cbc7175 100644
--- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -420,7 +420,7 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F,
     // (b) Leave that as the only edge in the "Indirect" PHI.
     // (c) Merge the two in the body block.
     BasicBlock::iterator Indirect = Target->begin(),
-                         End = Target->getFirstNonPHI()->getIterator();
+                         End = Target->getFirstNonPHIIt();
     BasicBlock::iterator Direct = DirectSucc->begin();
     BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt();
 
@@ -430,6 +430,7 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F,
     while (Indirect != End) {
       PHINode *DirPHI = cast<PHINode>(Direct);
       PHINode *IndPHI = cast<PHINode>(Indirect);
+      BasicBlock::iterator InsertPt = Indirect;
 
       // Now, clean up - the direct block shouldn't get the indirect value,
       // and vice versa.
@@ -440,7 +441,7 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F,
       // PHI is erased.
       Indirect++;
 
-      PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI);
+      PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", InsertPt);
       NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred),
                              IBRPred);
 
diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
index 4e84927f1cfc90..48c33d6c0c8e0e 100644
--- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -168,12 +168,12 @@ static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
 
   // Determine an appropriate location to create the bitcast for the return
   // value. The location depends on if we have a call or invoke instruction.
-  Instruction *InsertBefore = nullptr;
+  BasicBlock::iterator InsertBefore;
   if (auto *Invoke = dyn_cast<InvokeInst>(&CB))
     InsertBefore =
-        &SplitEdge(Invoke->getParent(), Invoke->getNormalDest())->front();
+        SplitEdge(Invoke->getParent(), Invoke->getNormalDest())->begin();
   else
-    InsertBefore = &*std::next(CB.getIterator());
+    InsertBefore = std::next(CB.getIterator());
 
   // Bitcast the return value to the correct type.
   auto *Cast = CastInst::CreateBitOrPointerCast(&CB, RetTy, "", InsertBefore);
@@ -509,7 +509,7 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee,
     Type *FormalTy = CalleeType->getParamType(ArgNo);
     Type *ActualTy = Arg->getType();
     if (FormalTy != ActualTy) {
-      auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", &CB);
+      auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", CB.getIterator());
       CB.setArgOperand(ArgNo, Cast);
 
       // Remove any incompatible attributes for the argument.
diff --git a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
index 282c4456346678..40010aee9c1114 100644
--- a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
+++ b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
@@ -144,7 +144,7 @@ void CanonicalizeFreezeInLoopsImpl::InsertFreezeAndForgetFromSCEV(Use &U) {
   LLVM_DEBUG(dbgs() << "\tOperand: " << *U.get() << "\n");
 
   U.set(new FreezeInst(ValueToFr, ValueToFr->getName() + ".frozen",
-                       PH->getTerminator()));
+                       PH->getTerminator()->getIterator()));
 
   SE.forgetValue(UserI);
 }
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index bab065153f3efa..3071ec0c911321 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -570,7 +570,7 @@ void CodeExtractor::findAllocas(const CodeExtractorAnalysisCache &CEAC,
       LLVMContext &Ctx = M->getContext();
       auto *Int8PtrTy = PointerType::getUnqual(Ctx);
       CastInst *CastI =
-          CastInst::CreatePointerCast(AI, Int8PtrTy, "lt.cast", I);
+          CastInst::CreatePointerCast(AI, Int8PtrTy, "lt.cast", I->getIterator());
       I->replaceUsesOfWith(I->getOperand(1), CastI);
     }
 
@@ -1024,7 +1024,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
       Value *Idx[2];
       Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext()));
       Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), aggIdx);
-      Instruction *TI = newFunction->begin()->getTerminator();
+      BasicBlock::iterator TI = newFunction->begin()->getTerminator()->getIterator();
       GetElementPtrInst *GEP = GetElementPtrInst::Create(
           StructTy, &*AggAI, Idx, "gep_" + inputs[i]->getName(), TI);
       RewriteVal = new LoadInst(StructTy->getElementType(aggIdx), GEP,
@@ -1173,7 +1173,7 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
       AllocaInst *alloca =
         new AllocaInst(output->getType(), DL.getAllocaAddrSpace(),
                        nullptr, output->getName() + ".loc",
-                       &codeReplacer->getParent()->front().front());
+                       codeReplacer->getParent()->front().begin());
       ReloadOutputs.push_back(alloca);
       params.push_back(alloca);
       ++ScalarOutputArgNo;
@@ -1192,8 +1192,8 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
     StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
     Struct = new AllocaInst(
         StructArgTy, DL.getAllocaAddrSpace(), nullptr, "structArg",
-        AllocationBlock ? &*AllocationBlock->getFirstInsertionPt()
-                        : &codeReplacer->getParent()->front().front());
+        AllocationBlock ? AllocationBlock->getFirstInsertionPt()
+                        : codeReplacer->getParent()->front().begin());
 
     if (ArgsInZeroAddressSpace && DL.getAllocaAddrSpace() != 0) {
       auto *StructSpaceCast = new AddrSpaceCastInst(
@@ -1358,9 +1358,8 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
     else
       InsertPt = std::next(OutI->getIterator());
 
-    Instruction *InsertBefore = &*InsertPt;
-    assert((InsertBefore->getFunction() == newFunction ||
-            Blocks.count(InsertBefore->getParent())) &&
+    assert((InsertPt->getFunction() == newFunction ||
+            Blocks.count(InsertPt->getParent())) &&
            "InsertPt should be in new function");
     if (AggregateArgs && StructValues.contains(outputs[i])) {
       assert(AggOutputArgBegin != newFunction->arg_end() &&
@@ -1371,8 +1370,8 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
       Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), aggIdx);
       GetElementPtrInst *GEP = GetElementPtrInst::Create(
           StructArgTy, &*AggOutputArgBegin, Idx, "gep_" + outputs[i]->getName(),
-          InsertBefore);
-      new StoreInst(outputs[i], GEP, InsertBefore);
+          InsertPt);
+      new StoreInst(outputs[i], GEP, InsertPt);
       ++aggIdx;
       // Since there should be only one struct argument aggregating
       // all the output values, we shouldn't increment AggOutputArgBegin, which
@@ -1381,7 +1380,7 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
       assert(ScalarOutputArgBegin != newFunction->arg_end() &&
              "Number of scalar output arguments should match "
              "the number of defined values");
-      new StoreInst(outputs[i], &*ScalarOutputArgBegin, InsertBefore);
+      new StoreInst(outputs[i], &*ScalarOutputArgBegin, InsertPt);
       ++ScalarOutputArgBegin;
     }
   }
@@ -1396,15 +1395,15 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
 
     // Check if the function should return a value
     if (OldFnRetTy->isVoidTy()) {
-      ReturnInst::Create(Context, nullptr, TheSwitch);  // Return void
+      ReturnInst::Create(Context, nullptr, TheSwitch->getIterator());  // Return void
     } else if (OldFnRetTy == TheSwitch->getCondition()->getType()) {
       // return what we have
-      ReturnInst::Create(Context, TheSwitch->getCondition(), TheSwitch);
+      ReturnInst::Create(Context, TheSwitch->getCondition(), TheSwitch->getIterator());
     } else {
       // Otherwise we must have code extracted an unwind or something, just
       // return whatever we want.
       ReturnInst::Create(Context,
-                         Constant::getNullValue(OldFnRetTy), TheSwitch);
+                         Constant::getNullValue(OldFnRetTy), TheSwitch->getIterator());
     }
 
     TheSwitch->eraseFromParent();
@@ -1412,12 +1411,12 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
   case 1:
     // Only a single destination, change the switch into an unconditional
     // branch.
-    BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch);
+    BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getIterator());
     TheSwitch->eraseFromParent();
     break;
   case 2:
     BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getSuccessor(2),
-                       call, TheSwitch);
+                       call, TheSwitch->getIterator());
     TheSwitch->eraseFromParent();
     break;
   default:
diff --git a/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
index c894afee68a27a..b2a88eadd3dee8 100644
--- a/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
+++ b/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -20,7 +20,7 @@ using namespace llvm;
 /// invalidating the SSA information for the value.  It returns the pointer to
 /// the alloca inserted to create a stack slot for I.
 AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
-                                   Instruction *AllocaPoint) {
+                                   std::optional<BasicBlock::iterator> AllocaPoint) {
   if (I.use_empty()) {
     I.eraseFromParent();
     return nullptr;
@@ -33,10 +33,10 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
   AllocaInst *Slot;
   if (AllocaPoint) {
     Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
-                          I.getName()+".reg2mem", AllocaPoint);
+                          I.getName()+".reg2mem", *AllocaPoint);
   } else {
     Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
-                          I.getName() + ".reg2mem", &F->getEntryBlock().front());
+                          I.getName() + ".reg2mem", F->getEntryBlock().begin());
   }
 
   // We cannot demote invoke instructions to the stack if their normal edge
@@ -73,7 +73,7 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
             // Insert the load into the predecessor block
             V = new LoadInst(I.getType(), Slot, I.getName() + ".reload",
                              VolatileLoads,
-                             PN->getIncomingBlock(i)->getTerminator());
+                             PN->getIncomingBlock(i)->getTerminator()->getIterator());
             Loads[PN->getIncomingBlock(i)] = V;
           }
           PN->setIncomingValue(i, V);
@@ -82,7 +82,7 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
     } else {
       // If this is a normal instruction, just insert a load.
       Value *V = new LoadInst(I.getType(), Slot, I.getName() + ".reload",
-                              VolatileLoads, U);
+                              VolatileLoads, U->getIterator());
       U->replaceUsesOfWith(&I, V);
     }
   }
@@ -99,7 +99,7 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
         break;
     if (isa<CatchSwitchInst>(InsertPt)) {
       for (BasicBlock *Handler : successors(&*InsertPt))
-        new StoreInst(&I, Slot, &*Handler->getFirstInsertionPt());
+        new StoreInst(&I, Slot, Handler->getFirstInsertionPt());
       return Slot;
     }
   } else {
@@ -107,14 +107,14 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
     InsertPt = II.getNormalDest()->getFirstInsertionPt();
   }
 
-  new StoreInst(&I, Slot, &*InsertPt);
+  new StoreInst(&I, Slot, InsertPt);
   return Slot;
 }
 
 /// DemotePHIToStack - This function takes a virtual register computed by a PHI
 /// node and replaces it with a slot in the stack frame allocated via alloca.
 /// The PHI node is deleted. It returns the pointer to the alloca inserted.
-AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
+AllocaInst *llvm::DemotePHIToStack(PHINode *P, std::optional<BasicBlock::iterator> AllocaPoint) {
   if (P->use_empty()) {
     P->eraseFromParent();
     return nullptr;
@@ -126,12 +126,12 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
   AllocaInst *Slot;
   if (AllocaPoint) {
     Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
-                          P->getName()+".reg2mem", AllocaPoint);
+                          P->getName()+".reg2mem", *AllocaPoint);
   } else {
     Function *F = P->getParent()->getParent();
     Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
                           P->getName() + ".reg2mem",
-                          &F->getEntryBlock().front());
+                          F->getEntryBlock().begin());
   }
 
   // Iterate over each operand inserting a store in each predecessor.
@@ -141,7 +141,7 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
              "Invoke edge not supported yet"); (void)II;
     }
     new StoreInst(P->getIncomingValue(i), Slot,
-                  P->getIncomingBlock(i)->getTerminator());
+                  P->getIncomingBlock(i)->getTerminator()->getIterator());
   }
 
   // Insert a load in place of the PHI and replace all uses.
@@ -159,12 +159,12 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
     }
     for (Instruction *User : Users) {
       Value *V =
-          new LoadInst(P->getType(), Slot, P->getName() + ".reload", User);
+          new LoadInst(P->getType(), Slot, P->getName() + ".reload", User->getIterator());
       User->replaceUsesOfWith(P, V);
     }
   } else {
     Value *V =
-        new LoadInst(P->getType(), Slot, P->getName() + ".reload", &*InsertPt);
+        new LoadInst(P->getType(), Slot, P->getName() + ".reload", InsertPt);
     P->replaceAllUsesWith(V);
   }
   // Delete PHI.
diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
index 092f1799755d17..f4207474e9a68a 100644
--- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
+++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
@@ -20,7 +20,7 @@
 using namespace llvm;
 
 static void insertCall(Function &CurFn, StringRef Func,
-                       Instruction *InsertionPt, DebugLoc DL) {
+                       BasicBlock::iterator InsertionPt, DebugLoc DL) {
   Module &M = *InsertionPt->getParent()->getParent()->getParent();
   LLVMContext &C = InsertionPt->getParent()->getContext();
 
@@ -105,7 +105,7 @@ static bool runOnFunction(Function &F, bool PostInlining) {
     if (auto SP = F.getSubprogram())
       DL = DILocation::get(SP->getContext(), SP->getScopeLine(), 0, SP);
 
-    insertCall(F, EntryFunc, &*F.begin()->getFirstInsertionPt(), DL);
+    insertCall(F, EntryFunc, F.begin()->getFirstInsertionPt(), DL);
     Changed = true;
     F.removeFnAttr(EntryAttr);
   }
@@ -126,7 +126,7 @@ static bool runOnFunction(Function &F, bool PostInlining) {
       else if (auto SP = F.getSubprogram())
         DL = DILocation::get(SP->getContext(), 0, 0, SP);
 
-      insertCall(F, ExitFunc, T, DL);
+      insertCall(F, ExitFunc, T->getIterator(), DL);
       Changed = true;
     }
     F.removeFnAttr(ExitAttr);
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index f68fdb26f28173..0e8e72678de613 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -689,7 +689,7 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
     if (auto *CRI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) {
       if (CRI->unwindsToCaller()) {
         auto *CleanupPad = CRI->getCleanupPad();
-        CleanupReturnInst::Create(CleanupPad, UnwindDest, CRI);
+        CleanupReturnInst::Create(CleanupPad, UnwindDest, CRI->getIterator());
         CRI->eraseFromParent();
         UpdatePHINodes(&*BB);
         // Finding a cleanupret with an unwind destination would confuse
@@ -737,7 +737,7 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
         auto *NewCatchSwitch = CatchSwitchInst::Create(
             CatchSwitch->getParentPad(), UnwindDest,
             CatchSwitch->getNumHandlers(), CatchSwitch->getName(),
-            CatchSwitch);
+            CatchSwitch->getIterator());
         for (BasicBlock *PadBB : CatchSwitch->handlers())
           NewCatchSwitch->addHandler(PadBB);
         // Propagate info for the old catchswitch over to the new one in
@@ -972,7 +972,7 @@ static void PropagateOperandBundles(Function::iterator InlinedBB,
     I->getOperandBundlesAsDefs(OpBundles);
     OpBundles.emplace_back("funclet", CallSiteEHPad);
 
-    Instruction *NewInst = CallBase::Create(I, OpBundles, I);
+    Instruction *NewInst = CallBase::Create(I, OpBundles, I->getIterator());
     NewInst->takeName(I);
     I->replaceAllUsesWith(NewInst);
     I->eraseFromParent();
@@ -2002,7 +2002,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind,
       Value *BundleArgs[] = {*objcarc::getAttachedARCFunction(&CB)};
       OperandBundleDef OB("clang.arc.attachedcall", BundleArgs);
       auto *NewCall = CallBase::addOperandBundle(
-          CI, LLVMContext::OB_clang_arc_attachedcall, OB, CI);
+          CI, LLVMContext::OB_clang_arc_attachedcall, OB, CI->getIterator());
       NewCall->copyMetadata(*CI);
       CI->replaceAllUsesWith(NewCall);
       CI->eraseFromParent();
@@ -2326,7 +2326,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
           OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs));
         }
 
-        Instruction *NewI = CallBase::Create(ICS, OpDefs, ICS);
+        Instruction *NewI = CallBase::Create(ICS, OpDefs, ICS->getIterator());
 
         // Note: the RAUW does the appropriate fixup in VMap, so we need to do
         // this even if the call returns void.
@@ -2479,7 +2479,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
           SmallVector<Value *, 6> Params(CI->args());
           Params.append(VarArgsToForward.begin(), VarArgsToForward.end());
           CallInst *NewCI = CallInst::Create(
-              CI->getFunctionType(), CI->getCalledOperand(), Params, "", CI);
+              CI->getFunctionType(), CI->getCalledOperand(), Params, "", CI->getIterator());
           NewCI->setDebugLoc(CI->getDebugLoc());
           NewCI->setAttributes(Attrs);
           NewCI->setCallingConv(CI->getCallingConv());
@@ -2776,7 +2776,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
     // If the call site was an invoke instruction, add a branch to the normal
     // destination.
     if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
-      BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), &CB);
+      BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), CB.getIterator());
       NewBr->setDebugLoc(Returns[0]->getDebugLoc());
     }
 
@@ -2813,7 +2813,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
   if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
 
     // Add an unconditional branch to make this look like the CallInst case...
-    CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), &CB);
+    CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), CB.getIterator());
 
     // Split the basic block.  This guarantees that no PHI nodes will have to be
     // updated due to new incoming edges, and make the invoke case more
@@ -2881,7 +2881,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
     DebugLoc Loc;
     for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
       ReturnInst *RI = Returns[i];
-      BranchInst* BI = BranchInst::Create(AfterCallBB, RI);
+      BranchInst* BI = BranchInst::Create(AfterCallBB, RI->getIterator());
       Loc = RI->getDebugLoc();
       BI->setDebugLoc(Loc);
       RI->eraseFromParent();
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 075eeb5b19fd2b..c4a8843f2840b3 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -2811,7 +2811,7 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool PreserveLCSSA,
     if (DTU)
       UniqueSuccessors.insert(Successor);
   }
-  auto *UI = new UnreachableInst(I->getContext(), I);
+  auto *UI = new UnreachableInst(I->getContext(), I->getIterator());
   UI->setDebugLoc(I->getDebugLoc());
 
   // All instructions after this are dead.
@@ -2868,7 +2868,7 @@ CallInst *llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
 
   // Follow the call by a branch to the normal destination.
   BasicBlock *NormalDestBB = II->getNormalDest();
-  BranchInst::Create(NormalDestBB, II);
+  BranchInst::Create(NormalDestBB, II->getIterator());
 
   // Update PHI nodes in the unwind destination
   BasicBlock *BB = II->getParent();
@@ -3048,7 +3048,7 @@ static bool markAliveBlocks(Function &F,
             // jump to the normal destination branch.
             BasicBlock *NormalDestBB = II->getNormalDest();
             BasicBlock *UnwindDestBB = II->getUnwindDest();
-            BranchInst::Create(NormalDestBB, II);
+            BranchInst::Create(NormalDestBB, II->getIterator());
             UnwindDestBB->removePredecessor(II->getParent());
             II->eraseFromParent();
             if (DTU)
@@ -3131,12 +3131,12 @@ Instruction *llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) {
   BasicBlock *UnwindDest;
 
   if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) {
-    NewTI = CleanupReturnInst::Create(CRI->getCleanupPad(), nullptr, CRI);
+    NewTI = CleanupReturnInst::Create(CRI->getCleanupPad(), nullptr, CRI->getIterator());
     UnwindDest = CRI->getUnwindDest();
   } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) {
     auto *NewCatchSwitch = CatchSwitchInst::Create(
         CatchSwitch->getParentPad(), nullptr, CatchSwitch->getNumHandlers(),
-        CatchSwitch->getName(), CatchSwitch);
+        CatchSwitch->getName(), CatchSwitch->getIterator());
     for (BasicBlock *PadBB : CatchSwitch->handlers())
       NewCatchSwitch->addHandler(PadBB);
 
@@ -3972,23 +3972,23 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
   // We may need to truncate the provider.
   if (DemandedTy != Provider->getType()) {
     auto *Trunc =
-        CastInst::CreateIntegerCast(Provider, DemandedTy, false, "trunc", I);
+        CastInst::CreateIntegerCast(Provider, DemandedTy, false, "trunc", I->getIterator());
     InsertedInsts.push_back(Trunc);
     Provider = Trunc;
   }
 
-  Instruction *Result = CallInst::Create(F, Provider, "rev", I);
+  Instruction *Result = CallInst::Create(F, Provider, "rev", I->getIterator());
   InsertedInsts.push_back(Result);
 
   if (!DemandedMask.isAllOnes()) {
     auto *Mask = ConstantInt::get(DemandedTy, DemandedMask);
-    Result = BinaryOperator::Create(Instruction::And, Result, Mask, "mask", I);
+    Result = BinaryOperator::Create(Instruction::And, Result, Mask, "mask", I->getIterator());
     InsertedInsts.push_back(Result);
   }
 
   // We may need to zeroextend back to the result type.
   if (ITy != Result->getType()) {
-    auto *ExtInst = CastInst::CreateIntegerCast(Result, ITy, false, "zext", I);
+    auto *ExtInst = CastInst::CreateIntegerCast(Result, ITy, false, "zext", I->getIterator());
     InsertedInsts.push_back(ExtInst);
   }
 
diff --git a/llvm/lib/Transforms/Utils/LoopConstrainer.cpp b/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
index ea6d952cfa7d4f..81545ef375219c 100644
--- a/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
+++ b/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
@@ -644,7 +644,7 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
   // value of the same PHI nodes if/when we continue execution.
   for (PHINode &PN : LS.Header->phis()) {
     PHINode *NewPHI = PHINode::Create(PN.getType(), 2, PN.getName() + ".copy",
-                                      BranchToContinuation);
+                                      BranchToContinuation->getIterator());
 
     NewPHI->addIncoming(PN.getIncomingValueForBlock(Preheader), Preheader);
     NewPHI->addIncoming(PN.getIncomingValueForBlock(LS.Latch),
@@ -653,7 +653,7 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
   }
 
   RRI.IndVarEnd = PHINode::Create(IndVarBase->getType(), 2, "indvar.end",
-                                  BranchToContinuation);
+                                  BranchToContinuation->getIterator());
   RRI.IndVarEnd->addIncoming(IndVarStart, Preheader);
   RRI.IndVarEnd->addIncoming(IndVarBase, RRI.ExitSelector);
 
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index adae796ee02997..cec47810e04447 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -874,7 +874,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
       // We can fold the conditional branch in the preheader, this makes things
       // simpler. The first step is to remove the extra edge to the Exit block.
       Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
-      BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
+      BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator());
       NewBI->setDebugLoc(PHBI->getDebugLoc());
       PHBI->eraseFromParent();
 
diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index 07e622b1577fe1..b38fecd6d8b416 100644
--- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -399,7 +399,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
   for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
     PHINode *PN = cast<PHINode>(I);
     PHINode *NewPN = PHINode::Create(PN->getType(), BackedgeBlocks.size(),
-                                     PN->getName()+".be", BETerminator);
+                                     PN->getName()+".be", BETerminator->getIterator());
 
     // Loop over the PHI node, moving all entries except the one for the
     // preheader over to the new PHI node.
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index ee6f7b35750af0..6f0d000815726e 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -721,7 +721,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
     DeadSucc->removePredecessor(Src, /* KeepOneInputPHIs */ true);
 
     // Replace the conditional branch with an unconditional one.
-    BranchInst::Create(Dest, Term);
+    BranchInst::Create(Dest, Term->getIterator());
     Term->eraseFromParent();
 
     DTUpdates.emplace_back(DominatorTree::Delete, Src, DeadSucc);
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index 26b8c790f2a062..c7b88d3c48a69f 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -522,7 +522,7 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
     // unconditional one to this one
     BranchInst *SubTerm =
         cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
-    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
+    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm->getIterator());
     SubTerm->eraseFromParent();
 
     SubLoopBlocksFirst[It]->replacePhiUsesWith(ForeBlocksLast[It],
@@ -535,7 +535,7 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
   // Aft blocks successors and phis
   BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
   if (CompletelyUnroll) {
-    BranchInst::Create(LoopExit, AftTerm);
+    BranchInst::Create(LoopExit, AftTerm->getIterator());
     AftTerm->eraseFromParent();
   } else {
     AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
@@ -550,7 +550,7 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
     // unconditional one to this one
     BranchInst *AftTerm =
         cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
-    BranchInst::Create(AftBlocksFirst[It], AftTerm);
+    BranchInst::Create(AftBlocksFirst[It], AftTerm->getIterator());
     AftTerm->eraseFromParent();
 
     AftBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
diff --git a/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp b/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp
index 4908535cba5411..791eeed9cb62cd 100644
--- a/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp
+++ b/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp
@@ -207,7 +207,7 @@ static bool runImpl(Module &M) {
       Value *Null = ConstantPointerNull::get(VoidStar);
       Value *Args[] = {CallDtors, Null, DsoHandle};
       Value *Res = CallInst::Create(AtExit, Args, "call", EntryBB);
-      Value *Cmp = new ICmpInst(*EntryBB, ICmpInst::ICMP_NE, Res,
+      Value *Cmp = new ICmpInst(EntryBB, ICmpInst::ICMP_NE, Res,
                                 Constant::getNullValue(Res->getType()));
       BranchInst::Create(FailBB, RetBB, Cmp, EntryBB);
 
diff --git a/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/llvm/lib/Transforms/Utils/LowerInvoke.cpp
index 22ff3c2e5848fc..ff2ab3c6dce9c8 100644
--- a/llvm/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/llvm/lib/Transforms/Utils/LowerInvoke.cpp
@@ -52,7 +52,7 @@ static bool runImpl(Function &F) {
       // Insert a normal call instruction...
       CallInst *NewCall =
           CallInst::Create(II->getFunctionType(), II->getCalledOperand(),
-                           CallArgs, OpBundles, "", II);
+                           CallArgs, OpBundles, "", II->getIterator());
       NewCall->takeName(II);
       NewCall->setCallingConv(II->getCallingConv());
       NewCall->setAttributes(II->getAttributes());
@@ -60,7 +60,7 @@ static bool runImpl(Function &F) {
       II->replaceAllUsesWith(NewCall);
 
       // Insert an unconditional branch to the normal destination.
-      BranchInst::Create(II->getNormalDest(), II);
+      BranchInst::Create(II->getNormalDest(), II->getIterator());
 
       // Remove any PHI node entries from the exception destination.
       II->getUnwindDest()->removePredecessor(&BB);
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 88934a34d2e2a0..acd3f2802031ea 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -384,10 +384,10 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
   // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
   // structure. Its block terminators (unconditional branches) are replaced by
   // the appropriate conditional branches when the loop is built.
-  ICmpInst *PtrCompare = new ICmpInst(InsertBefore, ICmpInst::ICMP_ULT,
+  ICmpInst *PtrCompare = new ICmpInst(InsertBefore->getIterator(), ICmpInst::ICMP_ULT,
                                       SrcAddr, DstAddr, "compare_src_dst");
   Instruction *ThenTerm, *ElseTerm;
-  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore, &ThenTerm,
+  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(), &ThenTerm,
                                 &ElseTerm);
 
   // Each part of the function consists of two blocks:
@@ -409,7 +409,7 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
   // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
   // between both backwards and forward copy clauses.
   ICmpInst *CompareN =
-      new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
+      new ICmpInst(OrigBB->getTerminator()->getIterator(), ICmpInst::ICMP_EQ, CopyLen,
                    ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
 
   // Copying backwards.
@@ -431,7 +431,7 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
       ExitBB, LoopBB);
   LoopPhi->addIncoming(IndexPtr, LoopBB);
   LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
-  BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
+  BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm->getIterator());
   ThenTerm->eraseFromParent();
 
   // Copying forward.
@@ -451,7 +451,7 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
   FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
   FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
 
-  BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
+  BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm->getIterator());
   ElseTerm->eraseFromParent();
 }
 
diff --git a/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/llvm/lib/Transforms/Utils/LowerSwitch.cpp
index 4131d36b572d74..f5921e5ccb0914 100644
--- a/llvm/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/llvm/lib/Transforms/Utils/LowerSwitch.cpp
@@ -165,20 +165,20 @@ BasicBlock *NewLeafBlock(CaseRange &Leaf, Value *Val, ConstantInt *LowerBound,
   if (Leaf.Low == Leaf.High) {
     // Make the seteq instruction...
     Comp =
-        new ICmpInst(*NewLeaf, ICmpInst::ICMP_EQ, Val, Leaf.Low, "SwitchLeaf");
+        new ICmpInst(NewLeaf, ICmpInst::ICMP_EQ, Val, Leaf.Low, "SwitchLeaf");
   } else {
     // Make range comparison
     if (Leaf.Low == LowerBound) {
       // Val >= Min && Val <= Hi --> Val <= Hi
-      Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_SLE, Val, Leaf.High,
+      Comp = new ICmpInst(NewLeaf, ICmpInst::ICMP_SLE, Val, Leaf.High,
                           "SwitchLeaf");
     } else if (Leaf.High == UpperBound) {
       // Val <= Max && Val >= Lo --> Val >= Lo
-      Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_SGE, Val, Leaf.Low,
+      Comp = new ICmpInst(NewLeaf, ICmpInst::ICMP_SGE, Val, Leaf.Low,
                           "SwitchLeaf");
     } else if (Leaf.Low->isZero()) {
       // Val >= 0 && Val <= Hi --> Val <=u Hi
-      Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Val, Leaf.High,
+      Comp = new ICmpInst(NewLeaf, ICmpInst::ICMP_ULE, Val, Leaf.High,
                           "SwitchLeaf");
     } else {
       // Emit V-Lo <=u Hi-Lo
@@ -186,7 +186,7 @@ BasicBlock *NewLeafBlock(CaseRange &Leaf, Value *Val, ConstantInt *LowerBound,
       Instruction *Add = BinaryOperator::CreateAdd(
           Val, NegLo, Val->getName() + ".off", NewLeaf);
       Constant *UpperBound = ConstantExpr::getAdd(NegLo, Leaf.High);
-      Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Add, UpperBound,
+      Comp = new ICmpInst(NewLeaf, ICmpInst::ICMP_ULE, Add, UpperBound,
                           "SwitchLeaf");
     }
   }
diff --git a/llvm/lib/Transforms/Utils/MatrixUtils.cpp b/llvm/lib/Transforms/Utils/MatrixUtils.cpp
index e218773cf5da15..7866d6434c1156 100644
--- a/llvm/lib/Transforms/Utils/MatrixUtils.cpp
+++ b/llvm/lib/Transforms/Utils/MatrixUtils.cpp
@@ -36,7 +36,7 @@ BasicBlock *TileInfo::CreateLoop(BasicBlock *Preheader, BasicBlock *Exit,
   BranchInst::Create(Body, Header);
   BranchInst::Create(Latch, Body);
   PHINode *IV =
-      PHINode::Create(I32Ty, 2, Name + ".iv", Header->getTerminator());
+      PHINode::Create(I32Ty, 2, Name + ".iv", Header->getTerminator()->getIterator());
   IV->addIncoming(ConstantInt::get(I32Ty, 0), Preheader);
 
   B.SetInsertPoint(Latch);
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index 2ff7c015107677..2ffe89a2458405 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -219,7 +219,7 @@ void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Alignment) {
   Type *PaddingType = ArrayType::get(Type::getInt8Ty(Ctx), AlignedSize - Size);
   Type *TypeWithPadding = StructType::get(AllocatedType, PaddingType);
   auto *NewAI = new AllocaInst(TypeWithPadding, Info.AI->getAddressSpace(),
-                               nullptr, "", Info.AI);
+                               nullptr, "", Info.AI->getIterator());
   NewAI->takeName(Info.AI);
   NewAI->setAlignment(Info.AI->getAlign());
   NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca());
@@ -230,7 +230,7 @@ void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Alignment) {
 
   // TODO: Remove when typed pointers dropped
   if (Info.AI->getType() != NewAI->getType())
-    NewPtr = new BitCastInst(NewAI, Info.AI->getType(), "", Info.AI);
+    NewPtr = new BitCastInst(NewAI, Info.AI->getType(), "", Info.AI->getIterator());
 
   Info.AI->replaceAllUsesWith(NewPtr);
   Info.AI->eraseFromParent();
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 3dc6016a0a373f..a185e8cd371c60 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -177,7 +177,7 @@ static bool replaceSignedInst(SCCPSolver &Solver,
     Value *Op0 = Inst.getOperand(0);
     if (InsertedValues.count(Op0) || !isNonNegative(Op0))
       return false;
-    NewInst = new ZExtInst(Op0, Inst.getType(), "", &Inst);
+    NewInst = new ZExtInst(Op0, Inst.getType(), "", Inst.getIterator());
     NewInst->setNonNeg();
     break;
   }
@@ -186,7 +186,7 @@ static bool replaceSignedInst(SCCPSolver &Solver,
     Value *Op0 = Inst.getOperand(0);
     if (InsertedValues.count(Op0) || !isNonNegative(Op0))
       return false;
-    NewInst = BinaryOperator::CreateLShr(Op0, Inst.getOperand(1), "", &Inst);
+    NewInst = BinaryOperator::CreateLShr(Op0, Inst.getOperand(1), "", Inst.getIterator());
     NewInst->setIsExact(Inst.isExact());
     break;
   }
@@ -199,7 +199,7 @@ static bool replaceSignedInst(SCCPSolver &Solver,
       return false;
     auto NewOpcode = Inst.getOpcode() == Instruction::SDiv ? Instruction::UDiv
                                                            : Instruction::URem;
-    NewInst = BinaryOperator::Create(NewOpcode, Op0, Op1, "", &Inst);
+    NewInst = BinaryOperator::Create(NewOpcode, Op0, Op1, "", Inst.getIterator());
     if (Inst.getOpcode() == Instruction::SDiv)
       NewInst->setIsExact(Inst.isExact());
     break;
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 0f67cc3ff4faca..3a3bcde7c3dcdd 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1267,7 +1267,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
         // corresponding to the back-edge.
         Instruction *Add = BinaryOperator::CreateAdd(CanonicalIV, One,
                                                      "indvar.next",
-                                                     HP->getTerminator());
+                                                     HP->getTerminator()->getIterator());
         Add->setDebugLoc(HP->getTerminator()->getDebugLoc());
         rememberInstruction(Add);
         CanonicalIV->addIncoming(Add, HP);
@@ -2232,7 +2232,7 @@ Value *SCEVExpander::fixupLCSSAFormFor(Value *V) {
   if (!PreserveLCSSA || !DefI)
     return V;
 
-  Instruction *InsertPt = &*Builder.GetInsertPoint();
+  BasicBlock::iterator InsertPt = Builder.GetInsertPoint();
   Loop *DefLoop = SE.LI.getLoopFor(DefI->getParent());
   Loop *UseLoop = SE.LI.getLoopFor(InsertPt->getParent());
   if (!DefLoop || UseLoop == DefLoop || DefLoop->contains(UseLoop))
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index bbdbfbbf776f32..fe65e52110f97a 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2710,7 +2710,7 @@ static void MergeCompatibleInvokesImpl(ArrayRef<InvokeInst *> Invokes,
 
     // Form a PHI out of all the data ops under this index.
     PHINode *PN = PHINode::Create(
-        U->getType(), /*NumReservedValues=*/Invokes.size(), "", MergedInvoke);
+        U->getType(), /*NumReservedValues=*/Invokes.size(), "", MergedInvoke->getIterator());
     for (InvokeInst *II : Invokes)
       PN->addIncoming(II->getOperand(U.getOperandNo()), II->getParent());
 
@@ -4663,7 +4663,7 @@ bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm,
   } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
     // Neither of the selected blocks were successors, so this
     // terminator must be unreachable.
-    new UnreachableInst(OldTerm->getContext(), OldTerm);
+    new UnreachableInst(OldTerm->getContext(), OldTerm->getIterator());
   } else {
     // One of the selected values was a successor, but the other wasn't.
     // Insert an unconditional branch to the one that was found;
@@ -5353,7 +5353,7 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
       // or a degenerate conditional branch with matching destinations.
       if (all_of(BI->successors(),
                  [BB](auto *Successor) { return Successor == BB; })) {
-        new UnreachableInst(TI->getContext(), TI);
+        new UnreachableInst(TI->getContext(), TI->getIterator());
         TI->eraseFromParent();
         Changed = true;
       } else {
@@ -5452,7 +5452,7 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
             removeUnwindEdge(EHPred, DTU);
         }
         // The catchswitch is no longer reachable.
-        new UnreachableInst(CSI->getContext(), CSI);
+        new UnreachableInst(CSI->getContext(), CSI->getIterator());
         CSI->eraseFromParent();
         Changed = true;
       }
@@ -5462,7 +5462,7 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
              "Expected to always have an unwind to BB.");
       if (DTU)
         Updates.push_back({DominatorTree::Delete, Predecessor, BB});
-      new UnreachableInst(TI->getContext(), TI);
+      new UnreachableInst(TI->getContext(), TI->getIterator());
       TI->eraseFromParent();
       Changed = true;
     }
@@ -6604,7 +6604,7 @@ static void reuseTableCompare(
     // The compare yields the same result, just inverted. We can replace it.
     Value *InvertedTableCmp = BinaryOperator::CreateXor(
         RangeCmp, ConstantInt::get(RangeCmp->getType(), 1), "inverted.cmp",
-        RangeCheckBranch);
+        RangeCheckBranch->getIterator());
     CmpInst->replaceAllUsesWith(InvertedTableCmp);
     ++NumTableCmpReuses;
   }
@@ -7155,14 +7155,14 @@ bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
 
   if (IBI->getNumDestinations() == 0) {
     // If the indirectbr has no successors, change it to unreachable.
-    new UnreachableInst(IBI->getContext(), IBI);
+    new UnreachableInst(IBI->getContext(), IBI->getIterator());
     EraseTerminatorAndDCECond(IBI);
     return true;
   }
 
   if (IBI->getNumDestinations() == 1) {
     // If the indirectbr has one successor, change it to a direct branch.
-    BranchInst::Create(IBI->getDestination(0), IBI);
+    BranchInst::Create(IBI->getDestination(0), IBI->getIterator());
     EraseTerminatorAndDCECond(IBI);
     return true;
   }
diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index 297cfe5124d85d..b8fa985fa3462e 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -301,7 +301,7 @@ bool SimplifyIndvar::eliminateSDiv(BinaryOperator *SDiv) {
   if (SE->isKnownNonNegative(N) && SE->isKnownNonNegative(D)) {
     auto *UDiv = BinaryOperator::Create(
         BinaryOperator::UDiv, SDiv->getOperand(0), SDiv->getOperand(1),
-        SDiv->getName() + ".udiv", SDiv);
+        SDiv->getName() + ".udiv", SDiv->getIterator());
     UDiv->setIsExact(SDiv->isExact());
     SDiv->replaceAllUsesWith(UDiv);
     LLVM_DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n');
@@ -318,7 +318,7 @@ bool SimplifyIndvar::eliminateSDiv(BinaryOperator *SDiv) {
 void SimplifyIndvar::replaceSRemWithURem(BinaryOperator *Rem) {
   auto *N = Rem->getOperand(0), *D = Rem->getOperand(1);
   auto *URem = BinaryOperator::Create(BinaryOperator::URem, N, D,
-                                      Rem->getName() + ".urem", Rem);
+                                      Rem->getName() + ".urem", Rem->getIterator());
   Rem->replaceAllUsesWith(URem);
   LLVM_DEBUG(dbgs() << "INDVARS: Simplified srem: " << *Rem << '\n');
   ++NumSimplifiedSRem;
@@ -339,9 +339,9 @@ void SimplifyIndvar::replaceRemWithNumerator(BinaryOperator *Rem) {
 void SimplifyIndvar::replaceRemWithNumeratorOrZero(BinaryOperator *Rem) {
   auto *T = Rem->getType();
   auto *N = Rem->getOperand(0), *D = Rem->getOperand(1);
-  ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, N, D);
+  ICmpInst *ICmp = new ICmpInst(Rem->getIterator(), ICmpInst::ICMP_EQ, N, D);
   SelectInst *Sel =
-      SelectInst::Create(ICmp, ConstantInt::get(T, 0), N, "iv.rem", Rem);
+      SelectInst::Create(ICmp, ConstantInt::get(T, 0), N, "iv.rem", Rem->getIterator());
   Rem->replaceAllUsesWith(Sel);
   LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
   ++NumElimRem;
@@ -411,7 +411,7 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(WithOverflowInst *WO) {
   // intrinsic as well.
 
   BinaryOperator *NewResult = BinaryOperator::Create(
-      WO->getBinaryOp(), WO->getLHS(), WO->getRHS(), "", WO);
+      WO->getBinaryOp(), WO->getLHS(), WO->getRHS(), "", WO->getIterator());
 
   if (WO->isSigned())
     NewResult->setHasNoSignedWrap(true);
@@ -449,7 +449,7 @@ bool SimplifyIndvar::eliminateSaturatingIntrinsic(SaturatingInst *SI) {
     return false;
 
   BinaryOperator *BO = BinaryOperator::Create(
-      SI->getBinaryOp(), SI->getLHS(), SI->getRHS(), SI->getName(), SI);
+      SI->getBinaryOp(), SI->getLHS(), SI->getRHS(), SI->getName(), SI->getIterator());
   if (SI->isSigned())
     BO->setHasNoSignedWrap();
   else
@@ -1767,7 +1767,7 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU,
 
         PHINode *WidePhi =
           PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide",
-                          UsePhi);
+                          UsePhi->getIterator());
         WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0));
         BasicBlock *WidePhiBB = WidePhi->getParent();
         IRBuilder<> Builder(WidePhiBB, WidePhiBB->getFirstInsertionPt());
diff --git a/llvm/lib/Transforms/Utils/StripGCRelocates.cpp b/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
index 6094f36a77f45c..3ae76ffd5ecab4 100644
--- a/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
+++ b/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
@@ -42,7 +42,7 @@ static bool stripGCRelocates(Function &F) {
     // All gc_relocates are i8 addrspace(1)* typed, we need a bitcast from i8
     // addrspace(1)* to the type of the OrigPtr, if the are not the same.
     if (GCRel->getType() != OrigPtr->getType())
-      ReplaceGCRel = new BitCastInst(OrigPtr, GCRel->getType(), "cast", GCRel);
+      ReplaceGCRel = new BitCastInst(OrigPtr, GCRel->getType(), "cast", GCRel->getIterator());
 
     // Replace all uses of gc.relocate and delete the gc.relocate
     // There maybe unncessary bitcasts back to the OrigPtr type, an instcombine
diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
index 2f37f7f972cbbf..1d51f61351fe27 100644
--- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
+++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
@@ -119,7 +119,7 @@ static void restoreSSA(const DominatorTree &DT, const Loop *L,
     LLVM_DEBUG(dbgs() << "externally used: " << Def->getName() << "\n");
     auto NewPhi =
         PHINode::Create(Def->getType(), Incoming.size(),
-                        Def->getName() + ".moved", &LoopExitBlock->front());
+                        Def->getName() + ".moved", LoopExitBlock->begin());
     for (auto *In : Incoming) {
       LLVM_DEBUG(dbgs() << "predecessor " << In->getName() << ": ");
       if (Def->getParent() == In || DT.dominates(Def, In)) {
diff --git a/llvm/tools/bugpoint/Miscompilation.cpp b/llvm/tools/bugpoint/Miscompilation.cpp
index 22806bab83eecf..b165b8220c20bd 100644
--- a/llvm/tools/bugpoint/Miscompilation.cpp
+++ b/llvm/tools/bugpoint/Miscompilation.cpp
@@ -884,7 +884,7 @@ CleanupAndPrepareModules(BugDriver &BD, std::unique_ptr<Module> Test,
           // Check to see if we already looked up the value.
           Value *CachedVal =
               new LoadInst(F->getType(), Cache, "fpcache", EntryBB);
-          Value *IsNull = new ICmpInst(*EntryBB, ICmpInst::ICMP_EQ, CachedVal,
+          Value *IsNull = new ICmpInst(EntryBB, ICmpInst::ICMP_EQ, CachedVal,
                                        NullPtr, "isNull");
           BranchInst::Create(LookupBB, DoCallBB, IsNull, EntryBB);
 

From c4c35d9522d19a0a511ee2cc011a21661e69f3c0 Mon Sep 17 00:00:00 2001
From: Sirraide <aeternalmail@gmail.com>
Date: Thu, 29 Feb 2024 17:43:01 +0100
Subject: [PATCH 40/55] [Clang] [Sema] Handle `this` in `__restrict`-qualified
 member functions properly (#83187)

When resolving the type of `this` inside a member function, we were
attaching all qualifiers present on the member function to the class
type and then making it a pointer; however, `__restrict`, unlike `const`
and `volatile`, needs to be attached to the pointer type rather than the
pointee type.

This fixes #82941, #42411, and #18121.
---
 clang/docs/ReleaseNotes.rst          |  5 ++
 clang/lib/AST/DeclCXX.cpp            | 15 +++++-
 clang/lib/Sema/SemaExprCXX.cpp       |  2 +-
 clang/test/SemaCXX/restrict-this.cpp | 69 ++++++++++++++++++++++++++++
 4 files changed, 88 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/SemaCXX/restrict-this.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index a5c6b80c4e99e1..f44fef28b9f17f 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -293,6 +293,11 @@ Bug Fixes to C++ Support
   lookup searches the bases of an incomplete class.
 - Fix a crash when an unresolved overload set is encountered on the RHS of a ``.*`` operator.
   (`#53815 <https://github.com/llvm/llvm-project/issues/53815>`_)
+- In ``__restrict``-qualified member functions, attach ``__restrict`` to the pointer type of
+  ``this`` rather than the pointee type.
+  Fixes (`#82941 <https://github.com/llvm/llvm-project/issues/82941>`_),
+  (`#42411 <https://github.com/llvm/llvm-project/issues/42411>`_), and
+  (`#18121 <https://github.com/llvm/llvm-project/issues/18121>`_).
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp
index 117e802dae2d9d..b4f2327d9c560a 100644
--- a/clang/lib/AST/DeclCXX.cpp
+++ b/clang/lib/AST/DeclCXX.cpp
@@ -2543,8 +2543,19 @@ QualType CXXMethodDecl::getThisType(const FunctionProtoType *FPT,
                                     const CXXRecordDecl *Decl) {
   ASTContext &C = Decl->getASTContext();
   QualType ObjectTy = ::getThisObjectType(C, FPT, Decl);
-  return C.getLangOpts().HLSL ? C.getLValueReferenceType(ObjectTy)
-                              : C.getPointerType(ObjectTy);
+
+  // Unlike 'const' and 'volatile', a '__restrict' qualifier must be
+  // attached to the pointer type, not the pointee.
+  bool Restrict = FPT->getMethodQuals().hasRestrict();
+  if (Restrict)
+    ObjectTy.removeLocalRestrict();
+
+  ObjectTy = C.getLangOpts().HLSL ? C.getLValueReferenceType(ObjectTy)
+                                  : C.getPointerType(ObjectTy);
+
+  if (Restrict)
+    ObjectTy.addRestrict();
+  return ObjectTy;
 }
 
 QualType CXXMethodDecl::getThisType() const {
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 59758d3bd6d1a3..c4750ce78fa9c1 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -1220,7 +1220,7 @@ static QualType adjustCVQualifiersForCXXThisWithinLambda(
                     : nullptr;
     }
   }
-  return ASTCtx.getPointerType(ClassType);
+  return ThisTy;
 }
 
 QualType Sema::getCurrentThisType() {
diff --git a/clang/test/SemaCXX/restrict-this.cpp b/clang/test/SemaCXX/restrict-this.cpp
new file mode 100644
index 00000000000000..e78c8e0d56e2f8
--- /dev/null
+++ b/clang/test/SemaCXX/restrict-this.cpp
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 -verify -fsyntax-only %s
+// expected-no-diagnostics
+
+struct C {
+  void f() __restrict {
+    static_assert(__is_same(decltype(this), C *__restrict));
+    (void) [this]() {
+      static_assert(__is_same(decltype(this), C *__restrict));
+      (void) [this]() { static_assert(__is_same(decltype(this), C *__restrict)); };
+
+      // By-value capture means 'this' is now a different object; do not
+      // make it __restrict.
+      (void) [*this]() { static_assert(__is_same(decltype(this), const C *)); };
+      (void) [*this]() mutable { static_assert(__is_same(decltype(this), C *)); };
+    };
+  }
+};
+
+template <typename T> struct TC {
+  void f() __restrict {
+    static_assert(__is_same(decltype(this), TC<int> *__restrict));
+    (void) [this]() {
+      static_assert(__is_same(decltype(this), TC<int> *__restrict));
+      (void) [this]() { static_assert(__is_same(decltype(this), TC<int> *__restrict)); };
+
+      // By-value capture means 'this' is now a different object; do not
+      // make it __restrict.
+      (void) [*this]() { static_assert(__is_same(decltype(this), const TC<int> *)); };
+      (void) [*this]() mutable { static_assert(__is_same(decltype(this), TC<int> *)); };
+    };
+  }
+};
+
+void f() {
+  TC<int>{}.f();
+}
+
+namespace gh18121 {
+struct Foo {
+  void member() __restrict {
+    Foo *__restrict This = this;
+  }
+};
+}
+
+namespace gh42411 {
+struct foo {
+    int v;
+    void f() const __restrict {
+        static_assert(__is_same(decltype((v)), const int&));
+        (void) [this]() { static_assert(__is_same(decltype((v)), const int&)); };
+    }
+};
+}
+
+namespace gh82941 {
+void f(int& x) {
+    (void)x;
+}
+
+class C {
+    int x;
+    void g() __restrict;
+};
+
+void C::g() __restrict {
+    f(this->x);
+}
+}

From 6c7805d5d186a6d1263f90b8033ad85e2d2633d7 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Thu, 29 Feb 2024 16:49:40 +0000
Subject: [PATCH 41/55] Revert "[NFC][RemoveDIs] Bulk update utilities to
 insert with iterators"

This reverts commit 3fda50d3915b2163a54a37b602be7783a89dd808.

Apparently I've missed a hunk while staging this; will back out for now.

Picked up here: https://lab.llvm.org/buildbot/#/builders/139/builds/60429/steps/6/logs/stdio
---
 llvm/include/llvm/IR/Instructions.h           |  8 ++---
 llvm/include/llvm/Transforms/Utils/Local.h    |  4 +--
 llvm/lib/IR/Instructions.cpp                  |  4 +--
 llvm/lib/Transforms/CFGuard/CFGuard.cpp       |  2 +-
 llvm/lib/Transforms/Scalar/Reg2Mem.cpp        |  6 ++--
 llvm/lib/Transforms/Utils/BasicBlockUtils.cpp |  8 ++---
 .../Transforms/Utils/BreakCriticalEdges.cpp   |  5 ++-
 .../Transforms/Utils/CallPromotionUtils.cpp   |  8 ++---
 .../Utils/CanonicalizeFreezeInLoops.cpp       |  2 +-
 llvm/lib/Transforms/Utils/CodeExtractor.cpp   | 31 ++++++++++---------
 .../lib/Transforms/Utils/DemoteRegToStack.cpp | 26 ++++++++--------
 .../Utils/EntryExitInstrumenter.cpp           |  6 ++--
 llvm/lib/Transforms/Utils/InlineFunction.cpp  | 18 +++++------
 llvm/lib/Transforms/Utils/Local.cpp           | 18 +++++------
 llvm/lib/Transforms/Utils/LoopConstrainer.cpp |  4 +--
 .../Transforms/Utils/LoopRotationUtils.cpp    |  2 +-
 llvm/lib/Transforms/Utils/LoopSimplify.cpp    |  2 +-
 llvm/lib/Transforms/Utils/LoopUnroll.cpp      |  2 +-
 .../lib/Transforms/Utils/LoopUnrollAndJam.cpp |  6 ++--
 .../lib/Transforms/Utils/LowerGlobalDtors.cpp |  2 +-
 llvm/lib/Transforms/Utils/LowerInvoke.cpp     |  4 +--
 .../Transforms/Utils/LowerMemIntrinsics.cpp   | 10 +++---
 llvm/lib/Transforms/Utils/LowerSwitch.cpp     | 10 +++---
 llvm/lib/Transforms/Utils/MatrixUtils.cpp     |  2 +-
 .../Transforms/Utils/MemoryTaggingSupport.cpp |  4 +--
 llvm/lib/Transforms/Utils/SCCPSolver.cpp      |  6 ++--
 .../Utils/ScalarEvolutionExpander.cpp         |  4 +--
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 16 +++++-----
 llvm/lib/Transforms/Utils/SimplifyIndVar.cpp  | 14 ++++-----
 .../lib/Transforms/Utils/StripGCRelocates.cpp |  2 +-
 llvm/lib/Transforms/Utils/UnifyLoopExits.cpp  |  2 +-
 llvm/tools/bugpoint/Miscompilation.cpp        |  2 +-
 32 files changed, 120 insertions(+), 120 deletions(-)

diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index 4e4cf71a349d74..bc357074e5cb21 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -1298,14 +1298,14 @@ class ICmpInst: public CmpInst {
 
   /// Constructor with insert-at-end semantics.
   ICmpInst(
-    BasicBlock *InsertAtEnd, ///< Block to insert into.
+    BasicBlock &InsertAtEnd, ///< Block to insert into.
     Predicate pred,  ///< The predicate to use for the comparison
     Value *LHS,      ///< The left-hand-side of the expression
     Value *RHS,      ///< The right-hand-side of the expression
     const Twine &NameStr = ""  ///< Name of the instruction
   ) : CmpInst(makeCmpResultType(LHS->getType()),
               Instruction::ICmp, pred, LHS, RHS, NameStr,
-              InsertAtEnd) {
+              &InsertAtEnd) {
 #ifndef NDEBUG
   AssertOK();
 #endif
@@ -1481,14 +1481,14 @@ class FCmpInst: public CmpInst {
 
   /// Constructor with insert-at-end semantics.
   FCmpInst(
-    BasicBlock *InsertAtEnd, ///< Block to insert into.
+    BasicBlock &InsertAtEnd, ///< Block to insert into.
     Predicate pred,  ///< The predicate to use for the comparison
     Value *LHS,      ///< The left-hand-side of the expression
     Value *RHS,      ///< The right-hand-side of the expression
     const Twine &NameStr = ""  ///< Name of the instruction
   ) : CmpInst(makeCmpResultType(LHS->getType()),
               Instruction::FCmp, pred, LHS, RHS, NameStr,
-              InsertAtEnd) {
+              &InsertAtEnd) {
     AssertOK();
   }
 
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index 8dc843d2eaf604..2df3c9049c7d62 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -206,12 +206,12 @@ bool FoldBranchToCommonDest(BranchInst *BI, llvm::DomTreeUpdater *DTU = nullptr,
 /// to create a stack slot for X.
 AllocaInst *DemoteRegToStack(Instruction &X,
                              bool VolatileLoads = false,
-                             std::optional<BasicBlock::iterator> AllocaPoint = std::nullopt);
+                             Instruction *AllocaPoint = nullptr);
 
 /// This function takes a virtual register computed by a phi node and replaces
 /// it with a slot in the stack frame, allocated via alloca. The phi node is
 /// deleted and it returns the pointer to the alloca inserted.
-AllocaInst *DemotePHIToStack(PHINode *P, std::optional<BasicBlock::iterator> AllocaPoint = std::nullopt);
+AllocaInst *DemotePHIToStack(PHINode *P, Instruction *AllocaPoint = nullptr);
 
 /// If the specified pointer points to an object that we control, try to modify
 /// the object's alignment to PrefAlign. Returns a minimum known alignment of
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index c55d6cff84ed55..42cdcad78228f6 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -4608,10 +4608,10 @@ CmpInst *
 CmpInst::Create(OtherOps Op, Predicate predicate, Value *S1, Value *S2,
                 const Twine &Name, BasicBlock *InsertAtEnd) {
   if (Op == Instruction::ICmp) {
-    return new ICmpInst(InsertAtEnd, CmpInst::Predicate(predicate),
+    return new ICmpInst(*InsertAtEnd, CmpInst::Predicate(predicate),
                         S1, S2, Name);
   }
-  return new FCmpInst(InsertAtEnd, CmpInst::Predicate(predicate),
+  return new FCmpInst(*InsertAtEnd, CmpInst::Predicate(predicate),
                       S1, S2, Name);
 }
 
diff --git a/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
index 32326b5801031f..4d4306576017be 100644
--- a/llvm/lib/Transforms/CFGuard/CFGuard.cpp
+++ b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
@@ -219,7 +219,7 @@ void CFGuardImpl::insertCFGuardDispatch(CallBase *CB) {
   // Create a copy of the call/invoke instruction and add the new bundle.
   assert((isa<CallInst>(CB) || isa<InvokeInst>(CB)) &&
          "Unknown indirect call type");
-  CallBase *NewCB = CallBase::Create(CB, Bundles, CB->getIterator());
+  CallBase *NewCB = CallBase::Create(CB, Bundles, CB);
 
   // Change the target of the call to be the guard dispatch function.
   NewCB->setCalledOperand(GuardDispatchLoad);
diff --git a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
index ebc5075aa36fe8..6c2b3e9bd4a721 100644
--- a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -64,7 +64,7 @@ static bool runPass(Function &F) {
 
   CastInst *AllocaInsertionPoint = new BitCastInst(
       Constant::getNullValue(Type::getInt32Ty(F.getContext())),
-      Type::getInt32Ty(F.getContext()), "reg2mem alloca point", I);
+      Type::getInt32Ty(F.getContext()), "reg2mem alloca point", &*I);
 
   // Find the escaped instructions. But don't create stack slots for
   // allocas in entry block.
@@ -76,7 +76,7 @@ static bool runPass(Function &F) {
   // Demote escaped instructions
   NumRegsDemoted += WorkList.size();
   for (Instruction *I : WorkList)
-    DemoteRegToStack(*I, false, AllocaInsertionPoint->getIterator());
+    DemoteRegToStack(*I, false, AllocaInsertionPoint);
 
   WorkList.clear();
 
@@ -88,7 +88,7 @@ static bool runPass(Function &F) {
   // Demote phi nodes
   NumPhisDemoted += WorkList.size();
   for (Instruction *I : WorkList)
-    DemotePHIToStack(cast<PHINode>(I), AllocaInsertionPoint->getIterator());
+    DemotePHIToStack(cast<PHINode>(I), AllocaInsertionPoint);
 
   return true;
 }
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 5aa59acfa6df99..5bb109a04ff178 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -1297,7 +1297,7 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
     // PHI.
     // Create the new PHI node, insert it into NewBB at the end of the block
     PHINode *NewPHI =
-        PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI->getIterator());
+        PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI);
 
     // NOTE! This loop walks backwards for a reason! First off, this minimizes
     // the cost of removal if we end up removing a large number of values, and
@@ -1517,7 +1517,7 @@ static void SplitLandingPadPredecessorsImpl(
       assert(!LPad->getType()->isTokenTy() &&
              "Split cannot be applied if LPad is token type. Otherwise an "
              "invalid PHINode of token type would be created.");
-      PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad->getIterator());
+      PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad);
       PN->addIncoming(Clone1, NewBB1);
       PN->addIncoming(Clone2, NewBB2);
       LPad->replaceAllUsesWith(PN);
@@ -1904,7 +1904,7 @@ static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock,
     auto Phi = cast<PHINode>(I);
     auto NewPhi =
         PHINode::Create(Phi->getType(), Incoming.size(),
-                        Phi->getName() + ".moved", FirstGuardBlock->begin());
+                        Phi->getName() + ".moved", &FirstGuardBlock->front());
     for (auto *In : Incoming) {
       Value *V = UndefValue::get(Phi->getType());
       if (In == Out) {
@@ -2023,7 +2023,7 @@ static void calcPredicateUsingInteger(
       Value *Id1 = ConstantInt::get(Type::getInt32Ty(Context),
                                     std::distance(Outgoing.begin(), Succ1Iter));
       IncomingId = SelectInst::Create(Condition, Id0, Id1, "target.bb.idx",
-                                      In->getTerminator()->getIterator());
+                                      In->getTerminator());
     } else {
       // Get the index of the non-null successor.
       auto SuccIter = Succ0 ? find(Outgoing, Succ0) : find(Outgoing, Succ1);
diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 4606514cbc7175..4a784c11c611d9 100644
--- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -420,7 +420,7 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F,
     // (b) Leave that as the only edge in the "Indirect" PHI.
     // (c) Merge the two in the body block.
     BasicBlock::iterator Indirect = Target->begin(),
-                         End = Target->getFirstNonPHIIt();
+                         End = Target->getFirstNonPHI()->getIterator();
     BasicBlock::iterator Direct = DirectSucc->begin();
     BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt();
 
@@ -430,7 +430,6 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F,
     while (Indirect != End) {
       PHINode *DirPHI = cast<PHINode>(Direct);
       PHINode *IndPHI = cast<PHINode>(Indirect);
-      BasicBlock::iterator InsertPt = Indirect;
 
       // Now, clean up - the direct block shouldn't get the indirect value,
       // and vice versa.
@@ -441,7 +440,7 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F,
       // PHI is erased.
       Indirect++;
 
-      PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", InsertPt);
+      PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI);
       NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred),
                              IBRPred);
 
diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
index 48c33d6c0c8e0e..4e84927f1cfc90 100644
--- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -168,12 +168,12 @@ static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
 
   // Determine an appropriate location to create the bitcast for the return
   // value. The location depends on if we have a call or invoke instruction.
-  BasicBlock::iterator InsertBefore;
+  Instruction *InsertBefore = nullptr;
   if (auto *Invoke = dyn_cast<InvokeInst>(&CB))
     InsertBefore =
-        SplitEdge(Invoke->getParent(), Invoke->getNormalDest())->begin();
+        &SplitEdge(Invoke->getParent(), Invoke->getNormalDest())->front();
   else
-    InsertBefore = std::next(CB.getIterator());
+    InsertBefore = &*std::next(CB.getIterator());
 
   // Bitcast the return value to the correct type.
   auto *Cast = CastInst::CreateBitOrPointerCast(&CB, RetTy, "", InsertBefore);
@@ -509,7 +509,7 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee,
     Type *FormalTy = CalleeType->getParamType(ArgNo);
     Type *ActualTy = Arg->getType();
     if (FormalTy != ActualTy) {
-      auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", CB.getIterator());
+      auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", &CB);
       CB.setArgOperand(ArgNo, Cast);
 
       // Remove any incompatible attributes for the argument.
diff --git a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
index 40010aee9c1114..282c4456346678 100644
--- a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
+++ b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
@@ -144,7 +144,7 @@ void CanonicalizeFreezeInLoopsImpl::InsertFreezeAndForgetFromSCEV(Use &U) {
   LLVM_DEBUG(dbgs() << "\tOperand: " << *U.get() << "\n");
 
   U.set(new FreezeInst(ValueToFr, ValueToFr->getName() + ".frozen",
-                       PH->getTerminator()->getIterator()));
+                       PH->getTerminator()));
 
   SE.forgetValue(UserI);
 }
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 3071ec0c911321..bab065153f3efa 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -570,7 +570,7 @@ void CodeExtractor::findAllocas(const CodeExtractorAnalysisCache &CEAC,
       LLVMContext &Ctx = M->getContext();
       auto *Int8PtrTy = PointerType::getUnqual(Ctx);
       CastInst *CastI =
-          CastInst::CreatePointerCast(AI, Int8PtrTy, "lt.cast", I->getIterator());
+          CastInst::CreatePointerCast(AI, Int8PtrTy, "lt.cast", I);
       I->replaceUsesOfWith(I->getOperand(1), CastI);
     }
 
@@ -1024,7 +1024,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
       Value *Idx[2];
       Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext()));
       Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), aggIdx);
-      BasicBlock::iterator TI = newFunction->begin()->getTerminator()->getIterator();
+      Instruction *TI = newFunction->begin()->getTerminator();
       GetElementPtrInst *GEP = GetElementPtrInst::Create(
           StructTy, &*AggAI, Idx, "gep_" + inputs[i]->getName(), TI);
       RewriteVal = new LoadInst(StructTy->getElementType(aggIdx), GEP,
@@ -1173,7 +1173,7 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
       AllocaInst *alloca =
         new AllocaInst(output->getType(), DL.getAllocaAddrSpace(),
                        nullptr, output->getName() + ".loc",
-                       codeReplacer->getParent()->front().begin());
+                       &codeReplacer->getParent()->front().front());
       ReloadOutputs.push_back(alloca);
       params.push_back(alloca);
       ++ScalarOutputArgNo;
@@ -1192,8 +1192,8 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
     StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
     Struct = new AllocaInst(
         StructArgTy, DL.getAllocaAddrSpace(), nullptr, "structArg",
-        AllocationBlock ? AllocationBlock->getFirstInsertionPt()
-                        : codeReplacer->getParent()->front().begin());
+        AllocationBlock ? &*AllocationBlock->getFirstInsertionPt()
+                        : &codeReplacer->getParent()->front().front());
 
     if (ArgsInZeroAddressSpace && DL.getAllocaAddrSpace() != 0) {
       auto *StructSpaceCast = new AddrSpaceCastInst(
@@ -1358,8 +1358,9 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
     else
       InsertPt = std::next(OutI->getIterator());
 
-    assert((InsertPt->getFunction() == newFunction ||
-            Blocks.count(InsertPt->getParent())) &&
+    Instruction *InsertBefore = &*InsertPt;
+    assert((InsertBefore->getFunction() == newFunction ||
+            Blocks.count(InsertBefore->getParent())) &&
            "InsertPt should be in new function");
     if (AggregateArgs && StructValues.contains(outputs[i])) {
       assert(AggOutputArgBegin != newFunction->arg_end() &&
@@ -1370,8 +1371,8 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
       Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), aggIdx);
       GetElementPtrInst *GEP = GetElementPtrInst::Create(
           StructArgTy, &*AggOutputArgBegin, Idx, "gep_" + outputs[i]->getName(),
-          InsertPt);
-      new StoreInst(outputs[i], GEP, InsertPt);
+          InsertBefore);
+      new StoreInst(outputs[i], GEP, InsertBefore);
       ++aggIdx;
       // Since there should be only one struct argument aggregating
       // all the output values, we shouldn't increment AggOutputArgBegin, which
@@ -1380,7 +1381,7 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
       assert(ScalarOutputArgBegin != newFunction->arg_end() &&
              "Number of scalar output arguments should match "
              "the number of defined values");
-      new StoreInst(outputs[i], &*ScalarOutputArgBegin, InsertPt);
+      new StoreInst(outputs[i], &*ScalarOutputArgBegin, InsertBefore);
       ++ScalarOutputArgBegin;
     }
   }
@@ -1395,15 +1396,15 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
 
     // Check if the function should return a value
     if (OldFnRetTy->isVoidTy()) {
-      ReturnInst::Create(Context, nullptr, TheSwitch->getIterator());  // Return void
+      ReturnInst::Create(Context, nullptr, TheSwitch);  // Return void
     } else if (OldFnRetTy == TheSwitch->getCondition()->getType()) {
       // return what we have
-      ReturnInst::Create(Context, TheSwitch->getCondition(), TheSwitch->getIterator());
+      ReturnInst::Create(Context, TheSwitch->getCondition(), TheSwitch);
     } else {
       // Otherwise we must have code extracted an unwind or something, just
       // return whatever we want.
       ReturnInst::Create(Context,
-                         Constant::getNullValue(OldFnRetTy), TheSwitch->getIterator());
+                         Constant::getNullValue(OldFnRetTy), TheSwitch);
     }
 
     TheSwitch->eraseFromParent();
@@ -1411,12 +1412,12 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
   case 1:
     // Only a single destination, change the switch into an unconditional
     // branch.
-    BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getIterator());
+    BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch);
     TheSwitch->eraseFromParent();
     break;
   case 2:
     BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getSuccessor(2),
-                       call, TheSwitch->getIterator());
+                       call, TheSwitch);
     TheSwitch->eraseFromParent();
     break;
   default:
diff --git a/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
index b2a88eadd3dee8..c894afee68a27a 100644
--- a/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
+++ b/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -20,7 +20,7 @@ using namespace llvm;
 /// invalidating the SSA information for the value.  It returns the pointer to
 /// the alloca inserted to create a stack slot for I.
 AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
-                                   std::optional<BasicBlock::iterator> AllocaPoint) {
+                                   Instruction *AllocaPoint) {
   if (I.use_empty()) {
     I.eraseFromParent();
     return nullptr;
@@ -33,10 +33,10 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
   AllocaInst *Slot;
   if (AllocaPoint) {
     Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
-                          I.getName()+".reg2mem", *AllocaPoint);
+                          I.getName()+".reg2mem", AllocaPoint);
   } else {
     Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
-                          I.getName() + ".reg2mem", F->getEntryBlock().begin());
+                          I.getName() + ".reg2mem", &F->getEntryBlock().front());
   }
 
   // We cannot demote invoke instructions to the stack if their normal edge
@@ -73,7 +73,7 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
             // Insert the load into the predecessor block
             V = new LoadInst(I.getType(), Slot, I.getName() + ".reload",
                              VolatileLoads,
-                             PN->getIncomingBlock(i)->getTerminator()->getIterator());
+                             PN->getIncomingBlock(i)->getTerminator());
             Loads[PN->getIncomingBlock(i)] = V;
           }
           PN->setIncomingValue(i, V);
@@ -82,7 +82,7 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
     } else {
       // If this is a normal instruction, just insert a load.
       Value *V = new LoadInst(I.getType(), Slot, I.getName() + ".reload",
-                              VolatileLoads, U->getIterator());
+                              VolatileLoads, U);
       U->replaceUsesOfWith(&I, V);
     }
   }
@@ -99,7 +99,7 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
         break;
     if (isa<CatchSwitchInst>(InsertPt)) {
       for (BasicBlock *Handler : successors(&*InsertPt))
-        new StoreInst(&I, Slot, Handler->getFirstInsertionPt());
+        new StoreInst(&I, Slot, &*Handler->getFirstInsertionPt());
       return Slot;
     }
   } else {
@@ -107,14 +107,14 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
     InsertPt = II.getNormalDest()->getFirstInsertionPt();
   }
 
-  new StoreInst(&I, Slot, InsertPt);
+  new StoreInst(&I, Slot, &*InsertPt);
   return Slot;
 }
 
 /// DemotePHIToStack - This function takes a virtual register computed by a PHI
 /// node and replaces it with a slot in the stack frame allocated via alloca.
 /// The PHI node is deleted. It returns the pointer to the alloca inserted.
-AllocaInst *llvm::DemotePHIToStack(PHINode *P, std::optional<BasicBlock::iterator> AllocaPoint) {
+AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
   if (P->use_empty()) {
     P->eraseFromParent();
     return nullptr;
@@ -126,12 +126,12 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, std::optional<BasicBlock::iterato
   AllocaInst *Slot;
   if (AllocaPoint) {
     Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
-                          P->getName()+".reg2mem", *AllocaPoint);
+                          P->getName()+".reg2mem", AllocaPoint);
   } else {
     Function *F = P->getParent()->getParent();
     Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
                           P->getName() + ".reg2mem",
-                          F->getEntryBlock().begin());
+                          &F->getEntryBlock().front());
   }
 
   // Iterate over each operand inserting a store in each predecessor.
@@ -141,7 +141,7 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, std::optional<BasicBlock::iterato
              "Invoke edge not supported yet"); (void)II;
     }
     new StoreInst(P->getIncomingValue(i), Slot,
-                  P->getIncomingBlock(i)->getTerminator()->getIterator());
+                  P->getIncomingBlock(i)->getTerminator());
   }
 
   // Insert a load in place of the PHI and replace all uses.
@@ -159,12 +159,12 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, std::optional<BasicBlock::iterato
     }
     for (Instruction *User : Users) {
       Value *V =
-          new LoadInst(P->getType(), Slot, P->getName() + ".reload", User->getIterator());
+          new LoadInst(P->getType(), Slot, P->getName() + ".reload", User);
       User->replaceUsesOfWith(P, V);
     }
   } else {
     Value *V =
-        new LoadInst(P->getType(), Slot, P->getName() + ".reload", InsertPt);
+        new LoadInst(P->getType(), Slot, P->getName() + ".reload", &*InsertPt);
     P->replaceAllUsesWith(V);
   }
   // Delete PHI.
diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
index f4207474e9a68a..092f1799755d17 100644
--- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
+++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
@@ -20,7 +20,7 @@
 using namespace llvm;
 
 static void insertCall(Function &CurFn, StringRef Func,
-                       BasicBlock::iterator InsertionPt, DebugLoc DL) {
+                       Instruction *InsertionPt, DebugLoc DL) {
   Module &M = *InsertionPt->getParent()->getParent()->getParent();
   LLVMContext &C = InsertionPt->getParent()->getContext();
 
@@ -105,7 +105,7 @@ static bool runOnFunction(Function &F, bool PostInlining) {
     if (auto SP = F.getSubprogram())
       DL = DILocation::get(SP->getContext(), SP->getScopeLine(), 0, SP);
 
-    insertCall(F, EntryFunc, F.begin()->getFirstInsertionPt(), DL);
+    insertCall(F, EntryFunc, &*F.begin()->getFirstInsertionPt(), DL);
     Changed = true;
     F.removeFnAttr(EntryAttr);
   }
@@ -126,7 +126,7 @@ static bool runOnFunction(Function &F, bool PostInlining) {
       else if (auto SP = F.getSubprogram())
         DL = DILocation::get(SP->getContext(), 0, 0, SP);
 
-      insertCall(F, ExitFunc, T->getIterator(), DL);
+      insertCall(F, ExitFunc, T, DL);
       Changed = true;
     }
     F.removeFnAttr(ExitAttr);
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 0e8e72678de613..f68fdb26f28173 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -689,7 +689,7 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
     if (auto *CRI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) {
       if (CRI->unwindsToCaller()) {
         auto *CleanupPad = CRI->getCleanupPad();
-        CleanupReturnInst::Create(CleanupPad, UnwindDest, CRI->getIterator());
+        CleanupReturnInst::Create(CleanupPad, UnwindDest, CRI);
         CRI->eraseFromParent();
         UpdatePHINodes(&*BB);
         // Finding a cleanupret with an unwind destination would confuse
@@ -737,7 +737,7 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
         auto *NewCatchSwitch = CatchSwitchInst::Create(
             CatchSwitch->getParentPad(), UnwindDest,
             CatchSwitch->getNumHandlers(), CatchSwitch->getName(),
-            CatchSwitch->getIterator());
+            CatchSwitch);
         for (BasicBlock *PadBB : CatchSwitch->handlers())
           NewCatchSwitch->addHandler(PadBB);
         // Propagate info for the old catchswitch over to the new one in
@@ -972,7 +972,7 @@ static void PropagateOperandBundles(Function::iterator InlinedBB,
     I->getOperandBundlesAsDefs(OpBundles);
     OpBundles.emplace_back("funclet", CallSiteEHPad);
 
-    Instruction *NewInst = CallBase::Create(I, OpBundles, I->getIterator());
+    Instruction *NewInst = CallBase::Create(I, OpBundles, I);
     NewInst->takeName(I);
     I->replaceAllUsesWith(NewInst);
     I->eraseFromParent();
@@ -2002,7 +2002,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind,
       Value *BundleArgs[] = {*objcarc::getAttachedARCFunction(&CB)};
       OperandBundleDef OB("clang.arc.attachedcall", BundleArgs);
       auto *NewCall = CallBase::addOperandBundle(
-          CI, LLVMContext::OB_clang_arc_attachedcall, OB, CI->getIterator());
+          CI, LLVMContext::OB_clang_arc_attachedcall, OB, CI);
       NewCall->copyMetadata(*CI);
       CI->replaceAllUsesWith(NewCall);
       CI->eraseFromParent();
@@ -2326,7 +2326,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
           OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs));
         }
 
-        Instruction *NewI = CallBase::Create(ICS, OpDefs, ICS->getIterator());
+        Instruction *NewI = CallBase::Create(ICS, OpDefs, ICS);
 
         // Note: the RAUW does the appropriate fixup in VMap, so we need to do
         // this even if the call returns void.
@@ -2479,7 +2479,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
           SmallVector<Value *, 6> Params(CI->args());
           Params.append(VarArgsToForward.begin(), VarArgsToForward.end());
           CallInst *NewCI = CallInst::Create(
-              CI->getFunctionType(), CI->getCalledOperand(), Params, "", CI->getIterator());
+              CI->getFunctionType(), CI->getCalledOperand(), Params, "", CI);
           NewCI->setDebugLoc(CI->getDebugLoc());
           NewCI->setAttributes(Attrs);
           NewCI->setCallingConv(CI->getCallingConv());
@@ -2776,7 +2776,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
     // If the call site was an invoke instruction, add a branch to the normal
     // destination.
     if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
-      BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), CB.getIterator());
+      BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), &CB);
       NewBr->setDebugLoc(Returns[0]->getDebugLoc());
     }
 
@@ -2813,7 +2813,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
   if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
 
     // Add an unconditional branch to make this look like the CallInst case...
-    CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), CB.getIterator());
+    CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), &CB);
 
     // Split the basic block.  This guarantees that no PHI nodes will have to be
     // updated due to new incoming edges, and make the invoke case more
@@ -2881,7 +2881,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
     DebugLoc Loc;
     for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
       ReturnInst *RI = Returns[i];
-      BranchInst* BI = BranchInst::Create(AfterCallBB, RI->getIterator());
+      BranchInst* BI = BranchInst::Create(AfterCallBB, RI);
       Loc = RI->getDebugLoc();
       BI->setDebugLoc(Loc);
       RI->eraseFromParent();
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index c4a8843f2840b3..075eeb5b19fd2b 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -2811,7 +2811,7 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool PreserveLCSSA,
     if (DTU)
       UniqueSuccessors.insert(Successor);
   }
-  auto *UI = new UnreachableInst(I->getContext(), I->getIterator());
+  auto *UI = new UnreachableInst(I->getContext(), I);
   UI->setDebugLoc(I->getDebugLoc());
 
   // All instructions after this are dead.
@@ -2868,7 +2868,7 @@ CallInst *llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
 
   // Follow the call by a branch to the normal destination.
   BasicBlock *NormalDestBB = II->getNormalDest();
-  BranchInst::Create(NormalDestBB, II->getIterator());
+  BranchInst::Create(NormalDestBB, II);
 
   // Update PHI nodes in the unwind destination
   BasicBlock *BB = II->getParent();
@@ -3048,7 +3048,7 @@ static bool markAliveBlocks(Function &F,
             // jump to the normal destination branch.
             BasicBlock *NormalDestBB = II->getNormalDest();
             BasicBlock *UnwindDestBB = II->getUnwindDest();
-            BranchInst::Create(NormalDestBB, II->getIterator());
+            BranchInst::Create(NormalDestBB, II);
             UnwindDestBB->removePredecessor(II->getParent());
             II->eraseFromParent();
             if (DTU)
@@ -3131,12 +3131,12 @@ Instruction *llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) {
   BasicBlock *UnwindDest;
 
   if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) {
-    NewTI = CleanupReturnInst::Create(CRI->getCleanupPad(), nullptr, CRI->getIterator());
+    NewTI = CleanupReturnInst::Create(CRI->getCleanupPad(), nullptr, CRI);
     UnwindDest = CRI->getUnwindDest();
   } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) {
     auto *NewCatchSwitch = CatchSwitchInst::Create(
         CatchSwitch->getParentPad(), nullptr, CatchSwitch->getNumHandlers(),
-        CatchSwitch->getName(), CatchSwitch->getIterator());
+        CatchSwitch->getName(), CatchSwitch);
     for (BasicBlock *PadBB : CatchSwitch->handlers())
       NewCatchSwitch->addHandler(PadBB);
 
@@ -3972,23 +3972,23 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
   // We may need to truncate the provider.
   if (DemandedTy != Provider->getType()) {
     auto *Trunc =
-        CastInst::CreateIntegerCast(Provider, DemandedTy, false, "trunc", I->getIterator());
+        CastInst::CreateIntegerCast(Provider, DemandedTy, false, "trunc", I);
     InsertedInsts.push_back(Trunc);
     Provider = Trunc;
   }
 
-  Instruction *Result = CallInst::Create(F, Provider, "rev", I->getIterator());
+  Instruction *Result = CallInst::Create(F, Provider, "rev", I);
   InsertedInsts.push_back(Result);
 
   if (!DemandedMask.isAllOnes()) {
     auto *Mask = ConstantInt::get(DemandedTy, DemandedMask);
-    Result = BinaryOperator::Create(Instruction::And, Result, Mask, "mask", I->getIterator());
+    Result = BinaryOperator::Create(Instruction::And, Result, Mask, "mask", I);
     InsertedInsts.push_back(Result);
   }
 
   // We may need to zeroextend back to the result type.
   if (ITy != Result->getType()) {
-    auto *ExtInst = CastInst::CreateIntegerCast(Result, ITy, false, "zext", I->getIterator());
+    auto *ExtInst = CastInst::CreateIntegerCast(Result, ITy, false, "zext", I);
     InsertedInsts.push_back(ExtInst);
   }
 
diff --git a/llvm/lib/Transforms/Utils/LoopConstrainer.cpp b/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
index 81545ef375219c..ea6d952cfa7d4f 100644
--- a/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
+++ b/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
@@ -644,7 +644,7 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
   // value of the same PHI nodes if/when we continue execution.
   for (PHINode &PN : LS.Header->phis()) {
     PHINode *NewPHI = PHINode::Create(PN.getType(), 2, PN.getName() + ".copy",
-                                      BranchToContinuation->getIterator());
+                                      BranchToContinuation);
 
     NewPHI->addIncoming(PN.getIncomingValueForBlock(Preheader), Preheader);
     NewPHI->addIncoming(PN.getIncomingValueForBlock(LS.Latch),
@@ -653,7 +653,7 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
   }
 
   RRI.IndVarEnd = PHINode::Create(IndVarBase->getType(), 2, "indvar.end",
-                                  BranchToContinuation->getIterator());
+                                  BranchToContinuation);
   RRI.IndVarEnd->addIncoming(IndVarStart, Preheader);
   RRI.IndVarEnd->addIncoming(IndVarBase, RRI.ExitSelector);
 
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index cec47810e04447..adae796ee02997 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -874,7 +874,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
       // We can fold the conditional branch in the preheader, this makes things
       // simpler. The first step is to remove the extra edge to the Exit block.
       Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
-      BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator());
+      BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
       NewBI->setDebugLoc(PHBI->getDebugLoc());
       PHBI->eraseFromParent();
 
diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index b38fecd6d8b416..07e622b1577fe1 100644
--- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -399,7 +399,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
   for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
     PHINode *PN = cast<PHINode>(I);
     PHINode *NewPN = PHINode::Create(PN->getType(), BackedgeBlocks.size(),
-                                     PN->getName()+".be", BETerminator->getIterator());
+                                     PN->getName()+".be", BETerminator);
 
     // Loop over the PHI node, moving all entries except the one for the
     // preheader over to the new PHI node.
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 6f0d000815726e..ee6f7b35750af0 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -721,7 +721,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
     DeadSucc->removePredecessor(Src, /* KeepOneInputPHIs */ true);
 
     // Replace the conditional branch with an unconditional one.
-    BranchInst::Create(Dest, Term->getIterator());
+    BranchInst::Create(Dest, Term);
     Term->eraseFromParent();
 
     DTUpdates.emplace_back(DominatorTree::Delete, Src, DeadSucc);
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index c7b88d3c48a69f..26b8c790f2a062 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -522,7 +522,7 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
     // unconditional one to this one
     BranchInst *SubTerm =
         cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
-    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm->getIterator());
+    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
     SubTerm->eraseFromParent();
 
     SubLoopBlocksFirst[It]->replacePhiUsesWith(ForeBlocksLast[It],
@@ -535,7 +535,7 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
   // Aft blocks successors and phis
   BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
   if (CompletelyUnroll) {
-    BranchInst::Create(LoopExit, AftTerm->getIterator());
+    BranchInst::Create(LoopExit, AftTerm);
     AftTerm->eraseFromParent();
   } else {
     AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
@@ -550,7 +550,7 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
     // unconditional one to this one
     BranchInst *AftTerm =
         cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
-    BranchInst::Create(AftBlocksFirst[It], AftTerm->getIterator());
+    BranchInst::Create(AftBlocksFirst[It], AftTerm);
     AftTerm->eraseFromParent();
 
     AftBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
diff --git a/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp b/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp
index 791eeed9cb62cd..4908535cba5411 100644
--- a/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp
+++ b/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp
@@ -207,7 +207,7 @@ static bool runImpl(Module &M) {
       Value *Null = ConstantPointerNull::get(VoidStar);
       Value *Args[] = {CallDtors, Null, DsoHandle};
       Value *Res = CallInst::Create(AtExit, Args, "call", EntryBB);
-      Value *Cmp = new ICmpInst(EntryBB, ICmpInst::ICMP_NE, Res,
+      Value *Cmp = new ICmpInst(*EntryBB, ICmpInst::ICMP_NE, Res,
                                 Constant::getNullValue(Res->getType()));
       BranchInst::Create(FailBB, RetBB, Cmp, EntryBB);
 
diff --git a/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/llvm/lib/Transforms/Utils/LowerInvoke.cpp
index ff2ab3c6dce9c8..22ff3c2e5848fc 100644
--- a/llvm/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/llvm/lib/Transforms/Utils/LowerInvoke.cpp
@@ -52,7 +52,7 @@ static bool runImpl(Function &F) {
       // Insert a normal call instruction...
       CallInst *NewCall =
           CallInst::Create(II->getFunctionType(), II->getCalledOperand(),
-                           CallArgs, OpBundles, "", II->getIterator());
+                           CallArgs, OpBundles, "", II);
       NewCall->takeName(II);
       NewCall->setCallingConv(II->getCallingConv());
       NewCall->setAttributes(II->getAttributes());
@@ -60,7 +60,7 @@ static bool runImpl(Function &F) {
       II->replaceAllUsesWith(NewCall);
 
       // Insert an unconditional branch to the normal destination.
-      BranchInst::Create(II->getNormalDest(), II->getIterator());
+      BranchInst::Create(II->getNormalDest(), II);
 
       // Remove any PHI node entries from the exception destination.
       II->getUnwindDest()->removePredecessor(&BB);
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index acd3f2802031ea..88934a34d2e2a0 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -384,10 +384,10 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
   // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
   // structure. Its block terminators (unconditional branches) are replaced by
   // the appropriate conditional branches when the loop is built.
-  ICmpInst *PtrCompare = new ICmpInst(InsertBefore->getIterator(), ICmpInst::ICMP_ULT,
+  ICmpInst *PtrCompare = new ICmpInst(InsertBefore, ICmpInst::ICMP_ULT,
                                       SrcAddr, DstAddr, "compare_src_dst");
   Instruction *ThenTerm, *ElseTerm;
-  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(), &ThenTerm,
+  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore, &ThenTerm,
                                 &ElseTerm);
 
   // Each part of the function consists of two blocks:
@@ -409,7 +409,7 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
   // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
   // between both backwards and forward copy clauses.
   ICmpInst *CompareN =
-      new ICmpInst(OrigBB->getTerminator()->getIterator(), ICmpInst::ICMP_EQ, CopyLen,
+      new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
                    ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
 
   // Copying backwards.
@@ -431,7 +431,7 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
       ExitBB, LoopBB);
   LoopPhi->addIncoming(IndexPtr, LoopBB);
   LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
-  BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm->getIterator());
+  BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
   ThenTerm->eraseFromParent();
 
   // Copying forward.
@@ -451,7 +451,7 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
   FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
   FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
 
-  BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm->getIterator());
+  BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
   ElseTerm->eraseFromParent();
 }
 
diff --git a/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/llvm/lib/Transforms/Utils/LowerSwitch.cpp
index f5921e5ccb0914..4131d36b572d74 100644
--- a/llvm/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/llvm/lib/Transforms/Utils/LowerSwitch.cpp
@@ -165,20 +165,20 @@ BasicBlock *NewLeafBlock(CaseRange &Leaf, Value *Val, ConstantInt *LowerBound,
   if (Leaf.Low == Leaf.High) {
     // Make the seteq instruction...
     Comp =
-        new ICmpInst(NewLeaf, ICmpInst::ICMP_EQ, Val, Leaf.Low, "SwitchLeaf");
+        new ICmpInst(*NewLeaf, ICmpInst::ICMP_EQ, Val, Leaf.Low, "SwitchLeaf");
   } else {
     // Make range comparison
     if (Leaf.Low == LowerBound) {
       // Val >= Min && Val <= Hi --> Val <= Hi
-      Comp = new ICmpInst(NewLeaf, ICmpInst::ICMP_SLE, Val, Leaf.High,
+      Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_SLE, Val, Leaf.High,
                           "SwitchLeaf");
     } else if (Leaf.High == UpperBound) {
       // Val <= Max && Val >= Lo --> Val >= Lo
-      Comp = new ICmpInst(NewLeaf, ICmpInst::ICMP_SGE, Val, Leaf.Low,
+      Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_SGE, Val, Leaf.Low,
                           "SwitchLeaf");
     } else if (Leaf.Low->isZero()) {
       // Val >= 0 && Val <= Hi --> Val <=u Hi
-      Comp = new ICmpInst(NewLeaf, ICmpInst::ICMP_ULE, Val, Leaf.High,
+      Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Val, Leaf.High,
                           "SwitchLeaf");
     } else {
       // Emit V-Lo <=u Hi-Lo
@@ -186,7 +186,7 @@ BasicBlock *NewLeafBlock(CaseRange &Leaf, Value *Val, ConstantInt *LowerBound,
       Instruction *Add = BinaryOperator::CreateAdd(
           Val, NegLo, Val->getName() + ".off", NewLeaf);
       Constant *UpperBound = ConstantExpr::getAdd(NegLo, Leaf.High);
-      Comp = new ICmpInst(NewLeaf, ICmpInst::ICMP_ULE, Add, UpperBound,
+      Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Add, UpperBound,
                           "SwitchLeaf");
     }
   }
diff --git a/llvm/lib/Transforms/Utils/MatrixUtils.cpp b/llvm/lib/Transforms/Utils/MatrixUtils.cpp
index 7866d6434c1156..e218773cf5da15 100644
--- a/llvm/lib/Transforms/Utils/MatrixUtils.cpp
+++ b/llvm/lib/Transforms/Utils/MatrixUtils.cpp
@@ -36,7 +36,7 @@ BasicBlock *TileInfo::CreateLoop(BasicBlock *Preheader, BasicBlock *Exit,
   BranchInst::Create(Body, Header);
   BranchInst::Create(Latch, Body);
   PHINode *IV =
-      PHINode::Create(I32Ty, 2, Name + ".iv", Header->getTerminator()->getIterator());
+      PHINode::Create(I32Ty, 2, Name + ".iv", Header->getTerminator());
   IV->addIncoming(ConstantInt::get(I32Ty, 0), Preheader);
 
   B.SetInsertPoint(Latch);
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index 2ffe89a2458405..2ff7c015107677 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -219,7 +219,7 @@ void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Alignment) {
   Type *PaddingType = ArrayType::get(Type::getInt8Ty(Ctx), AlignedSize - Size);
   Type *TypeWithPadding = StructType::get(AllocatedType, PaddingType);
   auto *NewAI = new AllocaInst(TypeWithPadding, Info.AI->getAddressSpace(),
-                               nullptr, "", Info.AI->getIterator());
+                               nullptr, "", Info.AI);
   NewAI->takeName(Info.AI);
   NewAI->setAlignment(Info.AI->getAlign());
   NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca());
@@ -230,7 +230,7 @@ void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Alignment) {
 
   // TODO: Remove when typed pointers dropped
   if (Info.AI->getType() != NewAI->getType())
-    NewPtr = new BitCastInst(NewAI, Info.AI->getType(), "", Info.AI->getIterator());
+    NewPtr = new BitCastInst(NewAI, Info.AI->getType(), "", Info.AI);
 
   Info.AI->replaceAllUsesWith(NewPtr);
   Info.AI->eraseFromParent();
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index a185e8cd371c60..3dc6016a0a373f 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -177,7 +177,7 @@ static bool replaceSignedInst(SCCPSolver &Solver,
     Value *Op0 = Inst.getOperand(0);
     if (InsertedValues.count(Op0) || !isNonNegative(Op0))
       return false;
-    NewInst = new ZExtInst(Op0, Inst.getType(), "", Inst.getIterator());
+    NewInst = new ZExtInst(Op0, Inst.getType(), "", &Inst);
     NewInst->setNonNeg();
     break;
   }
@@ -186,7 +186,7 @@ static bool replaceSignedInst(SCCPSolver &Solver,
     Value *Op0 = Inst.getOperand(0);
     if (InsertedValues.count(Op0) || !isNonNegative(Op0))
       return false;
-    NewInst = BinaryOperator::CreateLShr(Op0, Inst.getOperand(1), "", Inst.getIterator());
+    NewInst = BinaryOperator::CreateLShr(Op0, Inst.getOperand(1), "", &Inst);
     NewInst->setIsExact(Inst.isExact());
     break;
   }
@@ -199,7 +199,7 @@ static bool replaceSignedInst(SCCPSolver &Solver,
       return false;
     auto NewOpcode = Inst.getOpcode() == Instruction::SDiv ? Instruction::UDiv
                                                            : Instruction::URem;
-    NewInst = BinaryOperator::Create(NewOpcode, Op0, Op1, "", Inst.getIterator());
+    NewInst = BinaryOperator::Create(NewOpcode, Op0, Op1, "", &Inst);
     if (Inst.getOpcode() == Instruction::SDiv)
       NewInst->setIsExact(Inst.isExact());
     break;
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 3a3bcde7c3dcdd..0f67cc3ff4faca 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1267,7 +1267,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
         // corresponding to the back-edge.
         Instruction *Add = BinaryOperator::CreateAdd(CanonicalIV, One,
                                                      "indvar.next",
-                                                     HP->getTerminator()->getIterator());
+                                                     HP->getTerminator());
         Add->setDebugLoc(HP->getTerminator()->getDebugLoc());
         rememberInstruction(Add);
         CanonicalIV->addIncoming(Add, HP);
@@ -2232,7 +2232,7 @@ Value *SCEVExpander::fixupLCSSAFormFor(Value *V) {
   if (!PreserveLCSSA || !DefI)
     return V;
 
-  BasicBlock::iterator InsertPt = Builder.GetInsertPoint();
+  Instruction *InsertPt = &*Builder.GetInsertPoint();
   Loop *DefLoop = SE.LI.getLoopFor(DefI->getParent());
   Loop *UseLoop = SE.LI.getLoopFor(InsertPt->getParent());
   if (!DefLoop || UseLoop == DefLoop || DefLoop->contains(UseLoop))
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index fe65e52110f97a..bbdbfbbf776f32 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2710,7 +2710,7 @@ static void MergeCompatibleInvokesImpl(ArrayRef<InvokeInst *> Invokes,
 
     // Form a PHI out of all the data ops under this index.
     PHINode *PN = PHINode::Create(
-        U->getType(), /*NumReservedValues=*/Invokes.size(), "", MergedInvoke->getIterator());
+        U->getType(), /*NumReservedValues=*/Invokes.size(), "", MergedInvoke);
     for (InvokeInst *II : Invokes)
       PN->addIncoming(II->getOperand(U.getOperandNo()), II->getParent());
 
@@ -4663,7 +4663,7 @@ bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm,
   } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
     // Neither of the selected blocks were successors, so this
     // terminator must be unreachable.
-    new UnreachableInst(OldTerm->getContext(), OldTerm->getIterator());
+    new UnreachableInst(OldTerm->getContext(), OldTerm);
   } else {
     // One of the selected values was a successor, but the other wasn't.
     // Insert an unconditional branch to the one that was found;
@@ -5353,7 +5353,7 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
       // or a degenerate conditional branch with matching destinations.
       if (all_of(BI->successors(),
                  [BB](auto *Successor) { return Successor == BB; })) {
-        new UnreachableInst(TI->getContext(), TI->getIterator());
+        new UnreachableInst(TI->getContext(), TI);
         TI->eraseFromParent();
         Changed = true;
       } else {
@@ -5452,7 +5452,7 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
             removeUnwindEdge(EHPred, DTU);
         }
         // The catchswitch is no longer reachable.
-        new UnreachableInst(CSI->getContext(), CSI->getIterator());
+        new UnreachableInst(CSI->getContext(), CSI);
         CSI->eraseFromParent();
         Changed = true;
       }
@@ -5462,7 +5462,7 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
              "Expected to always have an unwind to BB.");
       if (DTU)
         Updates.push_back({DominatorTree::Delete, Predecessor, BB});
-      new UnreachableInst(TI->getContext(), TI->getIterator());
+      new UnreachableInst(TI->getContext(), TI);
       TI->eraseFromParent();
       Changed = true;
     }
@@ -6604,7 +6604,7 @@ static void reuseTableCompare(
     // The compare yields the same result, just inverted. We can replace it.
     Value *InvertedTableCmp = BinaryOperator::CreateXor(
         RangeCmp, ConstantInt::get(RangeCmp->getType(), 1), "inverted.cmp",
-        RangeCheckBranch->getIterator());
+        RangeCheckBranch);
     CmpInst->replaceAllUsesWith(InvertedTableCmp);
     ++NumTableCmpReuses;
   }
@@ -7155,14 +7155,14 @@ bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
 
   if (IBI->getNumDestinations() == 0) {
     // If the indirectbr has no successors, change it to unreachable.
-    new UnreachableInst(IBI->getContext(), IBI->getIterator());
+    new UnreachableInst(IBI->getContext(), IBI);
     EraseTerminatorAndDCECond(IBI);
     return true;
   }
 
   if (IBI->getNumDestinations() == 1) {
     // If the indirectbr has one successor, change it to a direct branch.
-    BranchInst::Create(IBI->getDestination(0), IBI->getIterator());
+    BranchInst::Create(IBI->getDestination(0), IBI);
     EraseTerminatorAndDCECond(IBI);
     return true;
   }
diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index b8fa985fa3462e..297cfe5124d85d 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -301,7 +301,7 @@ bool SimplifyIndvar::eliminateSDiv(BinaryOperator *SDiv) {
   if (SE->isKnownNonNegative(N) && SE->isKnownNonNegative(D)) {
     auto *UDiv = BinaryOperator::Create(
         BinaryOperator::UDiv, SDiv->getOperand(0), SDiv->getOperand(1),
-        SDiv->getName() + ".udiv", SDiv->getIterator());
+        SDiv->getName() + ".udiv", SDiv);
     UDiv->setIsExact(SDiv->isExact());
     SDiv->replaceAllUsesWith(UDiv);
     LLVM_DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n');
@@ -318,7 +318,7 @@ bool SimplifyIndvar::eliminateSDiv(BinaryOperator *SDiv) {
 void SimplifyIndvar::replaceSRemWithURem(BinaryOperator *Rem) {
   auto *N = Rem->getOperand(0), *D = Rem->getOperand(1);
   auto *URem = BinaryOperator::Create(BinaryOperator::URem, N, D,
-                                      Rem->getName() + ".urem", Rem->getIterator());
+                                      Rem->getName() + ".urem", Rem);
   Rem->replaceAllUsesWith(URem);
   LLVM_DEBUG(dbgs() << "INDVARS: Simplified srem: " << *Rem << '\n');
   ++NumSimplifiedSRem;
@@ -339,9 +339,9 @@ void SimplifyIndvar::replaceRemWithNumerator(BinaryOperator *Rem) {
 void SimplifyIndvar::replaceRemWithNumeratorOrZero(BinaryOperator *Rem) {
   auto *T = Rem->getType();
   auto *N = Rem->getOperand(0), *D = Rem->getOperand(1);
-  ICmpInst *ICmp = new ICmpInst(Rem->getIterator(), ICmpInst::ICMP_EQ, N, D);
+  ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, N, D);
   SelectInst *Sel =
-      SelectInst::Create(ICmp, ConstantInt::get(T, 0), N, "iv.rem", Rem->getIterator());
+      SelectInst::Create(ICmp, ConstantInt::get(T, 0), N, "iv.rem", Rem);
   Rem->replaceAllUsesWith(Sel);
   LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
   ++NumElimRem;
@@ -411,7 +411,7 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(WithOverflowInst *WO) {
   // intrinsic as well.
 
   BinaryOperator *NewResult = BinaryOperator::Create(
-      WO->getBinaryOp(), WO->getLHS(), WO->getRHS(), "", WO->getIterator());
+      WO->getBinaryOp(), WO->getLHS(), WO->getRHS(), "", WO);
 
   if (WO->isSigned())
     NewResult->setHasNoSignedWrap(true);
@@ -449,7 +449,7 @@ bool SimplifyIndvar::eliminateSaturatingIntrinsic(SaturatingInst *SI) {
     return false;
 
   BinaryOperator *BO = BinaryOperator::Create(
-      SI->getBinaryOp(), SI->getLHS(), SI->getRHS(), SI->getName(), SI->getIterator());
+      SI->getBinaryOp(), SI->getLHS(), SI->getRHS(), SI->getName(), SI);
   if (SI->isSigned())
     BO->setHasNoSignedWrap();
   else
@@ -1767,7 +1767,7 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU,
 
         PHINode *WidePhi =
           PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide",
-                          UsePhi->getIterator());
+                          UsePhi);
         WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0));
         BasicBlock *WidePhiBB = WidePhi->getParent();
         IRBuilder<> Builder(WidePhiBB, WidePhiBB->getFirstInsertionPt());
diff --git a/llvm/lib/Transforms/Utils/StripGCRelocates.cpp b/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
index 3ae76ffd5ecab4..6094f36a77f45c 100644
--- a/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
+++ b/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
@@ -42,7 +42,7 @@ static bool stripGCRelocates(Function &F) {
     // All gc_relocates are i8 addrspace(1)* typed, we need a bitcast from i8
     // addrspace(1)* to the type of the OrigPtr, if the are not the same.
     if (GCRel->getType() != OrigPtr->getType())
-      ReplaceGCRel = new BitCastInst(OrigPtr, GCRel->getType(), "cast", GCRel->getIterator());
+      ReplaceGCRel = new BitCastInst(OrigPtr, GCRel->getType(), "cast", GCRel);
 
     // Replace all uses of gc.relocate and delete the gc.relocate
     // There maybe unncessary bitcasts back to the OrigPtr type, an instcombine
diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
index 1d51f61351fe27..2f37f7f972cbbf 100644
--- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
+++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
@@ -119,7 +119,7 @@ static void restoreSSA(const DominatorTree &DT, const Loop *L,
     LLVM_DEBUG(dbgs() << "externally used: " << Def->getName() << "\n");
     auto NewPhi =
         PHINode::Create(Def->getType(), Incoming.size(),
-                        Def->getName() + ".moved", LoopExitBlock->begin());
+                        Def->getName() + ".moved", &LoopExitBlock->front());
     for (auto *In : Incoming) {
       LLVM_DEBUG(dbgs() << "predecessor " << In->getName() << ": ");
       if (Def->getParent() == In || DT.dominates(Def, In)) {
diff --git a/llvm/tools/bugpoint/Miscompilation.cpp b/llvm/tools/bugpoint/Miscompilation.cpp
index b165b8220c20bd..22806bab83eecf 100644
--- a/llvm/tools/bugpoint/Miscompilation.cpp
+++ b/llvm/tools/bugpoint/Miscompilation.cpp
@@ -884,7 +884,7 @@ CleanupAndPrepareModules(BugDriver &BD, std::unique_ptr<Module> Test,
           // Check to see if we already looked up the value.
           Value *CachedVal =
               new LoadInst(F->getType(), Cache, "fpcache", EntryBB);
-          Value *IsNull = new ICmpInst(EntryBB, ICmpInst::ICMP_EQ, CachedVal,
+          Value *IsNull = new ICmpInst(*EntryBB, ICmpInst::ICMP_EQ, CachedVal,
                                        NullPtr, "isNull");
           BranchInst::Create(LookupBB, DoCallBB, IsNull, EntryBB);
 

From 6afda56faa6260cff4e6e9264226737d96d952c1 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Thu, 29 Feb 2024 08:55:52 -0800
Subject: [PATCH 42/55] [RISCV] Store RVC and TSO ELF flags explicitly in
 RISCVTargetStreamer. NFCI (#83344)

Instead of caching STI in the RISCVELFTargetStreamer, store the two
flags we need from it.

My goal is to allow RISCVAsmPrinter to override these flags using IR
module metadata for LTO. So they need to be separated from the STI used
to construct the TargetStreamer.

This patch should be NFC as long as no one is changing the contents of
the STI that was used to construct the TargetStreamer between the
constructor and the use of the flags.
---
 llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp   | 8 ++++----
 llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h     | 1 -
 .../lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp | 6 ++++++
 llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h  | 5 +++++
 4 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index b375e8bb4b8fac..cdf7c048a4bf11 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -31,12 +31,13 @@ using namespace llvm;
 // This part is for ELF object output.
 RISCVTargetELFStreamer::RISCVTargetELFStreamer(MCStreamer &S,
                                                const MCSubtargetInfo &STI)
-    : RISCVTargetStreamer(S), CurrentVendor("riscv"), STI(STI) {
+    : RISCVTargetStreamer(S), CurrentVendor("riscv") {
   MCAssembler &MCA = getStreamer().getAssembler();
   const FeatureBitset &Features = STI.getFeatureBits();
   auto &MAB = static_cast<RISCVAsmBackend &>(MCA.getBackend());
   setTargetABI(RISCVABI::computeTargetABI(STI.getTargetTriple(), Features,
                                           MAB.getTargetOptions().getABIName()));
+  setFlagsFromFeatures(STI);
   // `j label` in `.option norelax; j label; .option relax; ...; label:` needs a
   // relocation to ensure the jump target is correct after linking. This is due
   // to a limitation that shouldForceRelocation has to make the decision upfront
@@ -91,10 +92,9 @@ void RISCVTargetELFStreamer::finish() {
 
   unsigned EFlags = MCA.getELFHeaderEFlags();
 
-  if (STI.hasFeature(RISCV::FeatureStdExtC) ||
-      STI.hasFeature(RISCV::FeatureStdExtZca))
+  if (hasRVC())
     EFlags |= ELF::EF_RISCV_RVC;
-  if (STI.hasFeature(RISCV::FeatureStdExtZtso))
+  if (hasTSO())
     EFlags |= ELF::EF_RISCV_TSO;
 
   switch (ABI) {
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
index a6f54bf67b5d2b..e8f29cd8449ba0 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
@@ -46,7 +46,6 @@ class RISCVTargetELFStreamer : public RISCVTargetStreamer {
   StringRef CurrentVendor;
 
   MCSection *AttributeSection = nullptr;
-  const MCSubtargetInfo &STI;
 
   void emitAttribute(unsigned Attribute, unsigned Value) override;
   void emitTextAttribute(unsigned Attribute, StringRef String) override;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
index 071a3a5aa5d6e7..4a4b1e13c2b9ec 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
@@ -48,6 +48,12 @@ void RISCVTargetStreamer::setTargetABI(RISCVABI::ABI ABI) {
   TargetABI = ABI;
 }
 
+void RISCVTargetStreamer::setFlagsFromFeatures(const MCSubtargetInfo &STI) {
+  HasRVC = STI.hasFeature(RISCV::FeatureStdExtC) ||
+           STI.hasFeature(RISCV::FeatureStdExtZca);
+  HasTSO = STI.hasFeature(RISCV::FeatureStdExtZtso);
+}
+
 void RISCVTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI,
                                                bool EmitStackAlign) {
   if (EmitStackAlign) {
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
index 070e72fb157ae9..cb8bc21cb63557 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
@@ -33,6 +33,8 @@ struct RISCVOptionArchArg {
 
 class RISCVTargetStreamer : public MCTargetStreamer {
   RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown;
+  bool HasRVC = false;
+  bool HasTSO = false;
 
 public:
   RISCVTargetStreamer(MCStreamer &S);
@@ -58,6 +60,9 @@ class RISCVTargetStreamer : public MCTargetStreamer {
   void emitTargetAttributes(const MCSubtargetInfo &STI, bool EmitStackAlign);
   void setTargetABI(RISCVABI::ABI ABI);
   RISCVABI::ABI getTargetABI() const { return TargetABI; }
+  void setFlagsFromFeatures(const MCSubtargetInfo &STI);
+  bool hasRVC() const { return HasRVC; }
+  bool hasTSO() const { return HasTSO; }
 };
 
 // This part is for ascii assembly output

From 2d61979137cfea8f016e618dd17f5be8e7d865bf Mon Sep 17 00:00:00 2001
From: erichkeane <ekeane@nvidia.com>
Date: Thu, 29 Feb 2024 08:54:00 -0800
Subject: [PATCH 43/55] [OpenACC] Add dependent test for break/continue compute
 construct diag

I discovered while debugging something else that this could possibly
cause an assert if I'm not careful with followup patches, so add the
tests.
---
 clang/test/SemaOpenACC/no-branch-in-out.cpp | 97 +++++++++++++++++++++
 1 file changed, 97 insertions(+)

diff --git a/clang/test/SemaOpenACC/no-branch-in-out.cpp b/clang/test/SemaOpenACC/no-branch-in-out.cpp
index 232e372cedd357..9affdf733ace8d 100644
--- a/clang/test/SemaOpenACC/no-branch-in-out.cpp
+++ b/clang/test/SemaOpenACC/no-branch-in-out.cpp
@@ -15,3 +15,100 @@ void ReturnTest() {
     }
   }
 }
+
+template<typename T>
+void BreakContinue() {
+
+#pragma acc parallel
+  for(int i =0; i < 5; ++i) {
+    switch(i) {
+      case 0:
+      break; // leaves switch, not 'for'.
+      default:
+      i +=2;
+      break;
+    }
+    if (i == 2)
+      continue;
+
+    break;  // expected-error{{invalid branch out of OpenACC Compute Construct}}
+  }
+
+  int j;
+  switch(j) {
+    case 0:
+#pragma acc parallel
+    {
+      break; // expected-error{{invalid branch out of OpenACC Compute Construct}}
+    }
+    case 1:
+#pragma acc parallel
+    {
+    }
+    break;
+  }
+
+#pragma acc parallel
+  for(int i = 0; i < 5; ++i) {
+    if (i > 1)
+      break; // expected-error{{invalid branch out of OpenACC Compute Construct}}
+  }
+
+#pragma acc parallel
+  switch(j) {
+    case 1:
+      break;
+  }
+
+#pragma acc parallel
+  {
+    for(int i = 1; i < 100; i++) {
+      if (i > 4)
+        break;
+    }
+  }
+
+  for (int i =0; i < 5; ++i) {
+#pragma acc parallel
+    {
+      continue; // expected-error{{invalid branch out of OpenACC Compute Construct}}
+    }
+  }
+
+#pragma acc parallel
+  for (int i =0; i < 5; ++i) {
+    continue;
+  }
+
+#pragma acc parallel
+  for (int i =0; i < 5; ++i) {
+    {
+      continue;
+    }
+  }
+
+  for (int i =0; i < 5; ++i) {
+#pragma acc parallel
+    {
+      break; // expected-error{{invalid branch out of OpenACC Compute Construct}}
+    }
+  }
+
+#pragma acc parallel
+  while (j) {
+    --j;
+    if (j > 4)
+      break; // expected-error{{invalid branch out of OpenACC Compute Construct}}
+  }
+
+#pragma acc parallel
+  do {
+    --j;
+    if (j > 4)
+      break; // expected-error{{invalid branch out of OpenACC Compute Construct}}
+  } while (j );
+}
+
+void Instantiate() {
+  BreakContinue<int>();
+}

From 0699749cb45db77e8393f198b1837945a958aebf Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Thu, 29 Feb 2024 09:05:43 -0800
Subject: [PATCH 44/55] [flang][runtime] Moved support for some REAL(16)
 intrinsics to Float128Math. (#83383)

This adds support for 128-bit float versions of SCALE, NEAREST, MOD,
MODULO, SET_EXPONENT, EXPONENT, FRACTION, SPACING and RRSPACING.
---
 flang/runtime/Float128Math/CMakeLists.txt     |   9 +
 flang/runtime/Float128Math/acos.cpp           |   2 +-
 flang/runtime/Float128Math/acosh.cpp          |   2 +-
 flang/runtime/Float128Math/asin.cpp           |   2 +-
 flang/runtime/Float128Math/asinh.cpp          |   2 +-
 flang/runtime/Float128Math/atan.cpp           |   2 +-
 flang/runtime/Float128Math/atan2.cpp          |   2 +-
 flang/runtime/Float128Math/atanh.cpp          |   2 +-
 flang/runtime/Float128Math/cabs.cpp           |   2 +-
 flang/runtime/Float128Math/ceil.cpp           |   2 +-
 flang/runtime/Float128Math/cos.cpp            |   2 +-
 flang/runtime/Float128Math/cosh.cpp           |   2 +-
 flang/runtime/Float128Math/erf.cpp            |   2 +-
 flang/runtime/Float128Math/erfc.cpp           |   2 +-
 flang/runtime/Float128Math/exp.cpp            |   2 +-
 flang/runtime/Float128Math/exponent.cpp       |  26 ++
 flang/runtime/Float128Math/floor.cpp          |   2 +-
 flang/runtime/Float128Math/fraction.cpp       |  21 ++
 flang/runtime/Float128Math/hypot.cpp          |   2 +-
 flang/runtime/Float128Math/j0.cpp             |   2 +-
 flang/runtime/Float128Math/j1.cpp             |   2 +-
 flang/runtime/Float128Math/jn.cpp             |   2 +-
 flang/runtime/Float128Math/lgamma.cpp         |   2 +-
 flang/runtime/Float128Math/llround.cpp        |   2 +-
 flang/runtime/Float128Math/log.cpp            |   2 +-
 flang/runtime/Float128Math/log10.cpp          |   2 +-
 flang/runtime/Float128Math/lround.cpp         |   2 +-
 flang/runtime/Float128Math/math-entries.h     | 150 ++++----
 flang/runtime/Float128Math/mod-real.cpp       |  24 ++
 flang/runtime/Float128Math/modulo-real.cpp    |  24 ++
 flang/runtime/Float128Math/nearest.cpp        |  23 ++
 flang/runtime/Float128Math/norm2.cpp          |  34 +-
 .../Float128Math/numeric-template-specs.h     |  55 +++
 flang/runtime/Float128Math/pow.cpp            |   2 +-
 flang/runtime/Float128Math/round.cpp          |   2 +-
 flang/runtime/Float128Math/rrspacing.cpp      |  21 ++
 flang/runtime/Float128Math/scale.cpp          |  28 ++
 flang/runtime/Float128Math/set-exponent.cpp   |  23 ++
 flang/runtime/Float128Math/sin.cpp            |   2 +-
 flang/runtime/Float128Math/sinh.cpp           |   2 +-
 flang/runtime/Float128Math/spacing.cpp        |  21 ++
 flang/runtime/Float128Math/sqrt.cpp           |   6 +-
 flang/runtime/Float128Math/tan.cpp            |   2 +-
 flang/runtime/Float128Math/tanh.cpp           |   2 +-
 flang/runtime/Float128Math/tgamma.cpp         |   2 +-
 flang/runtime/Float128Math/trunc.cpp          |   2 +-
 flang/runtime/Float128Math/y0.cpp             |   2 +-
 flang/runtime/Float128Math/y1.cpp             |   2 +-
 flang/runtime/Float128Math/yn.cpp             |   2 +-
 flang/runtime/extrema.cpp                     |  85 +----
 flang/runtime/numeric-templates.h             | 339 ++++++++++++++++++
 flang/runtime/numeric.cpp                     | 211 +----------
 flang/runtime/reduction-templates.h           |  36 +-
 53 files changed, 762 insertions(+), 444 deletions(-)
 create mode 100644 flang/runtime/Float128Math/exponent.cpp
 create mode 100644 flang/runtime/Float128Math/fraction.cpp
 create mode 100644 flang/runtime/Float128Math/mod-real.cpp
 create mode 100644 flang/runtime/Float128Math/modulo-real.cpp
 create mode 100644 flang/runtime/Float128Math/nearest.cpp
 create mode 100644 flang/runtime/Float128Math/numeric-template-specs.h
 create mode 100644 flang/runtime/Float128Math/rrspacing.cpp
 create mode 100644 flang/runtime/Float128Math/scale.cpp
 create mode 100644 flang/runtime/Float128Math/set-exponent.cpp
 create mode 100644 flang/runtime/Float128Math/spacing.cpp
 create mode 100644 flang/runtime/numeric-templates.h

diff --git a/flang/runtime/Float128Math/CMakeLists.txt b/flang/runtime/Float128Math/CMakeLists.txt
index f11678cd70b769..60d44c78be0faf 100644
--- a/flang/runtime/Float128Math/CMakeLists.txt
+++ b/flang/runtime/Float128Math/CMakeLists.txt
@@ -59,7 +59,9 @@ set(sources
   erf.cpp
   erfc.cpp
   exp.cpp
+  exponent.cpp
   floor.cpp
+  fraction.cpp
   hypot.cpp
   j0.cpp
   j1.cpp
@@ -69,11 +71,18 @@ set(sources
   log.cpp
   log10.cpp
   lround.cpp
+  mod-real.cpp
+  modulo-real.cpp
+  nearest.cpp
   norm2.cpp
   pow.cpp
   round.cpp
+  rrspacing.cpp
+  scale.cpp
+  set-exponent.cpp
   sin.cpp
   sinh.cpp
+  spacing.cpp
   sqrt.cpp
   tan.cpp
   tanh.cpp
diff --git a/flang/runtime/Float128Math/acos.cpp b/flang/runtime/Float128Math/acos.cpp
index 531c79c7444bd3..14ff6944856844 100644
--- a/flang/runtime/Float128Math/acos.cpp
+++ b/flang/runtime/Float128Math/acos.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(AcosF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Acos<RTNAME(AcosF128)>::invoke(x);
+  return Acos<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/acosh.cpp b/flang/runtime/Float128Math/acosh.cpp
index 1495120edd1a07..9d70804e44a470 100644
--- a/flang/runtime/Float128Math/acosh.cpp
+++ b/flang/runtime/Float128Math/acosh.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(AcoshF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Acosh<RTNAME(AcoshF128)>::invoke(x);
+  return Acosh<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/asin.cpp b/flang/runtime/Float128Math/asin.cpp
index 2fb8c6c5e97d71..6781b23f0363db 100644
--- a/flang/runtime/Float128Math/asin.cpp
+++ b/flang/runtime/Float128Math/asin.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(AsinF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Asin<RTNAME(AsinF128)>::invoke(x);
+  return Asin<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/asinh.cpp b/flang/runtime/Float128Math/asinh.cpp
index 3630a77be42b2c..1310bc61c1de0f 100644
--- a/flang/runtime/Float128Math/asinh.cpp
+++ b/flang/runtime/Float128Math/asinh.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(AsinhF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Asinh<RTNAME(AsinhF128)>::invoke(x);
+  return Asinh<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/atan.cpp b/flang/runtime/Float128Math/atan.cpp
index 4609343e9d1273..f01382df90c0ee 100644
--- a/flang/runtime/Float128Math/atan.cpp
+++ b/flang/runtime/Float128Math/atan.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(AtanF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Atan<RTNAME(AtanF128)>::invoke(x);
+  return Atan<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/atan2.cpp b/flang/runtime/Float128Math/atan2.cpp
index c0175e67ec71bd..dd646b0452b115 100644
--- a/flang/runtime/Float128Math/atan2.cpp
+++ b/flang/runtime/Float128Math/atan2.cpp
@@ -15,7 +15,7 @@ extern "C" {
 CppTypeFor<TypeCategory::Real, 16> RTDEF(Atan2F128)(
     CppTypeFor<TypeCategory::Real, 16> x,
     CppTypeFor<TypeCategory::Real, 16> y) {
-  return Atan2<RTNAME(Atan2F128)>::invoke(x, y);
+  return Atan2<true>::invoke(x, y);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/atanh.cpp b/flang/runtime/Float128Math/atanh.cpp
index bfacb967117d70..5fc5ba5debc81a 100644
--- a/flang/runtime/Float128Math/atanh.cpp
+++ b/flang/runtime/Float128Math/atanh.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(AtanhF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Atanh<RTNAME(AtanhF128)>::invoke(x);
+  return Atanh<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/cabs.cpp b/flang/runtime/Float128Math/cabs.cpp
index 827b197a6a81ae..3b8c9d17003c6e 100644
--- a/flang/runtime/Float128Math/cabs.cpp
+++ b/flang/runtime/Float128Math/cabs.cpp
@@ -16,7 +16,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 // NOTE: Flang calls the runtime APIs using C _Complex ABI
 CppTypeFor<TypeCategory::Real, 16> RTDEF(CAbsF128)(CFloat128ComplexType x) {
-  return CAbs<RTNAME(CAbsF128)>::invoke(x);
+  return CAbs<true>::invoke(x);
 }
 #endif
 #endif
diff --git a/flang/runtime/Float128Math/ceil.cpp b/flang/runtime/Float128Math/ceil.cpp
index a53a2c27c616b5..ed4d164a62bedc 100644
--- a/flang/runtime/Float128Math/ceil.cpp
+++ b/flang/runtime/Float128Math/ceil.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(CeilF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Ceil<RTNAME(CeilF128)>::invoke(x);
+  return Ceil<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/cos.cpp b/flang/runtime/Float128Math/cos.cpp
index 845c970bd8e639..b93c92f275f791 100644
--- a/flang/runtime/Float128Math/cos.cpp
+++ b/flang/runtime/Float128Math/cos.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(CosF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Cos<RTNAME(CosF128)>::invoke(x);
+  return Cos<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/cosh.cpp b/flang/runtime/Float128Math/cosh.cpp
index acf6ff4130ee3c..a3662a826dcb1c 100644
--- a/flang/runtime/Float128Math/cosh.cpp
+++ b/flang/runtime/Float128Math/cosh.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(CoshF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Cosh<RTNAME(CoshF128)>::invoke(x);
+  return Cosh<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/erf.cpp b/flang/runtime/Float128Math/erf.cpp
index 862f3b97411873..631f71c76effe7 100644
--- a/flang/runtime/Float128Math/erf.cpp
+++ b/flang/runtime/Float128Math/erf.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(ErfF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Erf<RTNAME(ErfF128)>::invoke(x);
+  return Erf<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/erfc.cpp b/flang/runtime/Float128Math/erfc.cpp
index 0ac0b945563747..ea3cd646d8c4ba 100644
--- a/flang/runtime/Float128Math/erfc.cpp
+++ b/flang/runtime/Float128Math/erfc.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(ErfcF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Erfc<RTNAME(ErfcF128)>::invoke(x);
+  return Erfc<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/exp.cpp b/flang/runtime/Float128Math/exp.cpp
index 50386fdbfb6449..b1161b0f29294c 100644
--- a/flang/runtime/Float128Math/exp.cpp
+++ b/flang/runtime/Float128Math/exp.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(ExpF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Exp<RTNAME(ExpF128)>::invoke(x);
+  return Exp<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/exponent.cpp b/flang/runtime/Float128Math/exponent.cpp
new file mode 100644
index 00000000000000..1be1dd0d0ac8b8
--- /dev/null
+++ b/flang/runtime/Float128Math/exponent.cpp
@@ -0,0 +1,26 @@
+//===-- runtime/Float128Math/exponent.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+#include "numeric-template-specs.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+// EXPONENT (16.9.75)
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Exponent16_4)(F128Type x) {
+  return Exponent<CppTypeFor<TypeCategory::Integer, 4>>(x);
+}
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Exponent16_8)(F128Type x) {
+  return Exponent<CppTypeFor<TypeCategory::Integer, 8>>(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/floor.cpp b/flang/runtime/Float128Math/floor.cpp
index 48cf4e01448070..78a94984cac8a3 100644
--- a/flang/runtime/Float128Math/floor.cpp
+++ b/flang/runtime/Float128Math/floor.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(FloorF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Floor<RTNAME(FloorF128)>::invoke(x);
+  return Floor<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/fraction.cpp b/flang/runtime/Float128Math/fraction.cpp
new file mode 100644
index 00000000000000..8c9889b7f6871e
--- /dev/null
+++ b/flang/runtime/Float128Math/fraction.cpp
@@ -0,0 +1,21 @@
+//===-- runtime/Float128Math/fraction.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+#include "numeric-template-specs.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+// FRACTION (16.9.80)
+F128Type RTDEF(Fraction16)(F128Type x) { return Fraction(x); }
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/hypot.cpp b/flang/runtime/Float128Math/hypot.cpp
index 33c83a1654993e..b4fa1d66bcfa6a 100644
--- a/flang/runtime/Float128Math/hypot.cpp
+++ b/flang/runtime/Float128Math/hypot.cpp
@@ -15,7 +15,7 @@ extern "C" {
 CppTypeFor<TypeCategory::Real, 16> RTDEF(HypotF128)(
     CppTypeFor<TypeCategory::Real, 16> x,
     CppTypeFor<TypeCategory::Real, 16> y) {
-  return Hypot<RTNAME(HypotF128)>::invoke(x, y);
+  return Hypot<true>::invoke(x, y);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/j0.cpp b/flang/runtime/Float128Math/j0.cpp
index f8f3fe71d8a616..9390a7eeb3c605 100644
--- a/flang/runtime/Float128Math/j0.cpp
+++ b/flang/runtime/Float128Math/j0.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(J0F128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return J0<RTNAME(J0F128)>::invoke(x);
+  return J0<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/j1.cpp b/flang/runtime/Float128Math/j1.cpp
index 9a51b973e1cf88..c54927123388c6 100644
--- a/flang/runtime/Float128Math/j1.cpp
+++ b/flang/runtime/Float128Math/j1.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(J1F128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return J1<RTNAME(J1F128)>::invoke(x);
+  return J1<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/jn.cpp b/flang/runtime/Float128Math/jn.cpp
index 644a66863c0d23..15afd83400c320 100644
--- a/flang/runtime/Float128Math/jn.cpp
+++ b/flang/runtime/Float128Math/jn.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(JnF128)(
     int n, CppTypeFor<TypeCategory::Real, 16> x) {
-  return Jn<RTNAME(JnF128)>::invoke(n, x);
+  return Jn<true>::invoke(n, x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/lgamma.cpp b/flang/runtime/Float128Math/lgamma.cpp
index fff7dfcb9c15db..ac31c89a912b32 100644
--- a/flang/runtime/Float128Math/lgamma.cpp
+++ b/flang/runtime/Float128Math/lgamma.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(LgammaF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Lgamma<RTNAME(LgammaF128)>::invoke(x);
+  return Lgamma<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/llround.cpp b/flang/runtime/Float128Math/llround.cpp
index 00c62818af19db..b77281c507fe7c 100644
--- a/flang/runtime/Float128Math/llround.cpp
+++ b/flang/runtime/Float128Math/llround.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Integer, 8> RTDEF(LlroundF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Llround<RTNAME(LlroundF128)>::invoke(x);
+  return Llround<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/log.cpp b/flang/runtime/Float128Math/log.cpp
index 0cfe329c6f7f59..38e6b581fd849c 100644
--- a/flang/runtime/Float128Math/log.cpp
+++ b/flang/runtime/Float128Math/log.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(LogF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Log<RTNAME(LogF128)>::invoke(x);
+  return Log<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/log10.cpp b/flang/runtime/Float128Math/log10.cpp
index cd8bf27fcb121b..3c89c0e707774f 100644
--- a/flang/runtime/Float128Math/log10.cpp
+++ b/flang/runtime/Float128Math/log10.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(Log10F128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Log10<RTNAME(Log10F128)>::invoke(x);
+  return Log10<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/lround.cpp b/flang/runtime/Float128Math/lround.cpp
index 6ced66a1b2d3af..ce7a228038a1d3 100644
--- a/flang/runtime/Float128Math/lround.cpp
+++ b/flang/runtime/Float128Math/lround.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Integer, 4> RTDEF(LroundF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Lround<RTNAME(LroundF128)>::invoke(x);
+  return Lround<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h
index a0d81d0cbb5407..ad3f6aa18aa9a1 100644
--- a/flang/runtime/Float128Math/math-entries.h
+++ b/flang/runtime/Float128Math/math-entries.h
@@ -13,36 +13,40 @@
 #include "flang/Common/float128.h"
 #include "flang/Runtime/entry-names.h"
 #include <cfloat>
+#include <cmath>
 #include <type_traits>
 
+namespace {
+using namespace Fortran::runtime;
+using F128RetType = CppTypeFor<TypeCategory::Real, 16>;
+using I32RetType = CppTypeFor<TypeCategory::Integer, 4>;
+using I64RetType = CppTypeFor<TypeCategory::Integer, 8>;
+} // namespace
+
 namespace Fortran::runtime {
 
 // Define a class template to gracefully fail, when
 // there is no specialized template that implements
 // the required function via using the third-party
 // implementation.
-#define DEFINE_FALLBACK(caller) \
-  template <auto F> struct caller { \
-    template <typename... ATs> \
-    [[noreturn]] static std::invoke_result_t<decltype(F), ATs...> invoke( \
-        ATs... args) { \
+#define DEFINE_FALLBACK(caller, ret_type) \
+  template <bool = false, typename RT = ret_type> struct caller { \
+    template <typename... ATs> [[noreturn]] static RT invoke(ATs... args) { \
       Terminator terminator{__FILE__, __LINE__}; \
       terminator.Crash("Float128 variant of '%s' is unsupported", #caller); \
     } \
   };
 
 // Define template specialization that is calling the third-party
-// implementation. The template is specialized by a function pointer
-// that is the FortranFloat128Math entry point. The signatures
-// of the caller and the callee must match.
+// implementation.
 //
 // Defining the specialization for any target library requires
 // adding the generic template via DEFINE_FALLBACK, so that
 // a build with another target library that does not define
 // the same alias can gracefully fail in runtime.
 #define DEFINE_SIMPLE_ALIAS(caller, callee) \
-  template <typename RT, typename... ATs, RT (*p)(ATs...)> struct caller<p> { \
-    static RT invoke(ATs... args) { \
+  template <typename RT> struct caller<true, RT> { \
+    template <typename... ATs> static RT invoke(ATs... args) { \
       static_assert(std::is_invocable_r_v<RT, \
           decltype(callee(std::declval<ATs>()...))(ATs...), ATs...>); \
       if constexpr (std::is_same_v<RT, void>) { \
@@ -54,48 +58,58 @@ namespace Fortran::runtime {
   };
 
 // Define fallback callers.
-DEFINE_FALLBACK(Abs)
-DEFINE_FALLBACK(Acos)
-DEFINE_FALLBACK(Acosh)
-DEFINE_FALLBACK(Asin)
-DEFINE_FALLBACK(Asinh)
-DEFINE_FALLBACK(Atan)
-DEFINE_FALLBACK(Atan2)
-DEFINE_FALLBACK(Atanh)
-DEFINE_FALLBACK(Ceil)
-DEFINE_FALLBACK(Cos)
-DEFINE_FALLBACK(Cosh)
-DEFINE_FALLBACK(Erf)
-DEFINE_FALLBACK(Erfc)
-DEFINE_FALLBACK(Exp)
-DEFINE_FALLBACK(Floor)
-DEFINE_FALLBACK(Hypot)
-DEFINE_FALLBACK(J0)
-DEFINE_FALLBACK(J1)
-DEFINE_FALLBACK(Jn)
-DEFINE_FALLBACK(Lgamma)
-DEFINE_FALLBACK(Llround)
-DEFINE_FALLBACK(Lround)
-DEFINE_FALLBACK(Log)
-DEFINE_FALLBACK(Log10)
-DEFINE_FALLBACK(Pow)
-DEFINE_FALLBACK(Round)
-DEFINE_FALLBACK(Sin)
-DEFINE_FALLBACK(Sinh)
-DEFINE_FALLBACK(Sqrt)
-DEFINE_FALLBACK(Tan)
-DEFINE_FALLBACK(Tanh)
-DEFINE_FALLBACK(Tgamma)
-DEFINE_FALLBACK(Trunc)
-DEFINE_FALLBACK(Y0)
-DEFINE_FALLBACK(Y1)
-DEFINE_FALLBACK(Yn)
+#define DEFINE_FALLBACK_F128(caller) DEFINE_FALLBACK(caller, ::F128RetType)
+#define DEFINE_FALLBACK_I32(caller) DEFINE_FALLBACK(caller, ::I32RetType)
+#define DEFINE_FALLBACK_I64(caller) DEFINE_FALLBACK(caller, ::I64RetType)
+
+DEFINE_FALLBACK_F128(Abs)
+DEFINE_FALLBACK_F128(Acos)
+DEFINE_FALLBACK_F128(Acosh)
+DEFINE_FALLBACK_F128(Asin)
+DEFINE_FALLBACK_F128(Asinh)
+DEFINE_FALLBACK_F128(Atan)
+DEFINE_FALLBACK_F128(Atan2)
+DEFINE_FALLBACK_F128(Atanh)
+DEFINE_FALLBACK_F128(Ceil)
+DEFINE_FALLBACK_F128(Cos)
+DEFINE_FALLBACK_F128(Cosh)
+DEFINE_FALLBACK_F128(Erf)
+DEFINE_FALLBACK_F128(Erfc)
+DEFINE_FALLBACK_F128(Exp)
+DEFINE_FALLBACK_F128(Floor)
+DEFINE_FALLBACK_F128(Frexp)
+DEFINE_FALLBACK_F128(Hypot)
+DEFINE_FALLBACK_I32(Ilogb)
+DEFINE_FALLBACK_I32(Isinf)
+DEFINE_FALLBACK_I32(Isnan)
+DEFINE_FALLBACK_F128(J0)
+DEFINE_FALLBACK_F128(J1)
+DEFINE_FALLBACK_F128(Jn)
+DEFINE_FALLBACK_F128(Ldexp)
+DEFINE_FALLBACK_F128(Lgamma)
+DEFINE_FALLBACK_I64(Llround)
+DEFINE_FALLBACK_F128(Log)
+DEFINE_FALLBACK_F128(Log10)
+DEFINE_FALLBACK_I32(Lround)
+DEFINE_FALLBACK_F128(Nextafter)
+DEFINE_FALLBACK_F128(Pow)
+DEFINE_FALLBACK_F128(Qnan)
+DEFINE_FALLBACK_F128(Round)
+DEFINE_FALLBACK_F128(Sin)
+DEFINE_FALLBACK_F128(Sinh)
+DEFINE_FALLBACK_F128(Sqrt)
+DEFINE_FALLBACK_F128(Tan)
+DEFINE_FALLBACK_F128(Tanh)
+DEFINE_FALLBACK_F128(Tgamma)
+DEFINE_FALLBACK_F128(Trunc)
+DEFINE_FALLBACK_F128(Y0)
+DEFINE_FALLBACK_F128(Y1)
+DEFINE_FALLBACK_F128(Yn)
 
 #if HAS_LIBM
-// Define wrapper callers for libm.
-#include <ccomplex>
-#include <cmath>
+#include <limits>
 
+// Define wrapper callers for libm.
 #if LDBL_MANT_DIG == 113
 // Use STD math functions. They provide IEEE-754 128-bit float
 // support either via 'long double' or __float128.
@@ -118,15 +132,21 @@ DEFINE_SIMPLE_ALIAS(Erf, std::erf)
 DEFINE_SIMPLE_ALIAS(Erfc, std::erfc)
 DEFINE_SIMPLE_ALIAS(Exp, std::exp)
 DEFINE_SIMPLE_ALIAS(Floor, std::floor)
+DEFINE_SIMPLE_ALIAS(Frexp, std::frexp)
 DEFINE_SIMPLE_ALIAS(Hypot, std::hypot)
+DEFINE_SIMPLE_ALIAS(Ilogb, std::ilogb)
+DEFINE_SIMPLE_ALIAS(Isinf, std::isinf)
+DEFINE_SIMPLE_ALIAS(Isnan, std::isnan)
 DEFINE_SIMPLE_ALIAS(J0, j0l)
 DEFINE_SIMPLE_ALIAS(J1, j1l)
 DEFINE_SIMPLE_ALIAS(Jn, jnl)
+DEFINE_SIMPLE_ALIAS(Ldexp, std::ldexp)
 DEFINE_SIMPLE_ALIAS(Lgamma, std::lgamma)
 DEFINE_SIMPLE_ALIAS(Llround, std::llround)
-DEFINE_SIMPLE_ALIAS(Lround, std::lround)
 DEFINE_SIMPLE_ALIAS(Log, std::log)
 DEFINE_SIMPLE_ALIAS(Log10, std::log10)
+DEFINE_SIMPLE_ALIAS(Lround, std::lround)
+DEFINE_SIMPLE_ALIAS(Nextafter, std::nextafter)
 DEFINE_SIMPLE_ALIAS(Pow, std::pow)
 DEFINE_SIMPLE_ALIAS(Round, std::round)
 DEFINE_SIMPLE_ALIAS(Sin, std::sin)
@@ -139,6 +159,12 @@ DEFINE_SIMPLE_ALIAS(Trunc, std::trunc)
 DEFINE_SIMPLE_ALIAS(Y0, y0l)
 DEFINE_SIMPLE_ALIAS(Y1, y1l)
 DEFINE_SIMPLE_ALIAS(Yn, ynl)
+
+// Use numeric_limits to produce infinity of the right type.
+#define F128_RT_INFINITY \
+  (std::numeric_limits<CppTypeFor<TypeCategory::Real, 16>>::infinity())
+#define F128_RT_QNAN \
+  (std::numeric_limits<CppTypeFor<TypeCategory::Real, 16>>::quiet_NaN())
 #else // LDBL_MANT_DIG != 113
 #if !HAS_LIBMF128
 // glibc >=2.26 seems to have complete support for __float128
@@ -172,15 +198,21 @@ DEFINE_SIMPLE_ALIAS(Erf, erfq)
 DEFINE_SIMPLE_ALIAS(Erfc, erfcq)
 DEFINE_SIMPLE_ALIAS(Exp, expq)
 DEFINE_SIMPLE_ALIAS(Floor, floorq)
+DEFINE_SIMPLE_ALIAS(Frexp, frexpq)
 DEFINE_SIMPLE_ALIAS(Hypot, hypotq)
+DEFINE_SIMPLE_ALIAS(Ilogb, ilogbq)
+DEFINE_SIMPLE_ALIAS(Isinf, isinfq)
+DEFINE_SIMPLE_ALIAS(Isnan, isnanq)
 DEFINE_SIMPLE_ALIAS(J0, j0q)
 DEFINE_SIMPLE_ALIAS(J1, j1q)
 DEFINE_SIMPLE_ALIAS(Jn, jnq)
+DEFINE_SIMPLE_ALIAS(Ldexp, ldexpq)
 DEFINE_SIMPLE_ALIAS(Lgamma, lgammaq)
 DEFINE_SIMPLE_ALIAS(Llround, llroundq)
-DEFINE_SIMPLE_ALIAS(Lround, lroundq)
 DEFINE_SIMPLE_ALIAS(Log, logq)
 DEFINE_SIMPLE_ALIAS(Log10, log10q)
+DEFINE_SIMPLE_ALIAS(Lround, lroundq)
+DEFINE_SIMPLE_ALIAS(Nextafter, nextafterq)
 DEFINE_SIMPLE_ALIAS(Pow, powq)
 DEFINE_SIMPLE_ALIAS(Round, roundq)
 DEFINE_SIMPLE_ALIAS(Sin, sinq)
@@ -193,19 +225,11 @@ DEFINE_SIMPLE_ALIAS(Trunc, truncq)
 DEFINE_SIMPLE_ALIAS(Y0, y0q)
 DEFINE_SIMPLE_ALIAS(Y1, y1q)
 DEFINE_SIMPLE_ALIAS(Yn, ynq)
-#endif
 
-extern "C" {
-// Declarations of the entry points that might be referenced
-// within the Float128Math library itself.
-// Note that not all of these entry points are actually
-// defined in this library. Some of them are used just
-// as template parameters to call the corresponding callee directly.
-CppTypeFor<TypeCategory::Real, 16> RTDECL(AbsF128)(
-    CppTypeFor<TypeCategory::Real, 16> x);
-CppTypeFor<TypeCategory::Real, 16> RTDECL(SqrtF128)(
-    CppTypeFor<TypeCategory::Real, 16> x);
-} // extern "C"
+// Use cmath INFINITY/NAN definition. Rely on C implicit conversions.
+#define F128_RT_INFINITY (INFINITY)
+#define F128_RT_QNAN (NAN)
+#endif
 
 } // namespace Fortran::runtime
 
diff --git a/flang/runtime/Float128Math/mod-real.cpp b/flang/runtime/Float128Math/mod-real.cpp
new file mode 100644
index 00000000000000..42e6ce76e2fa1b
--- /dev/null
+++ b/flang/runtime/Float128Math/mod-real.cpp
@@ -0,0 +1,24 @@
+//===-- runtime/Float128Math/mod-real.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+#include "numeric-template-specs.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+// MOD (16.9.135)
+F128Type RTDEF(ModReal16)(
+    F128Type x, F128Type p, const char *sourceFile, int sourceLine) {
+  return RealMod<false>(x, p, sourceFile, sourceLine);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/modulo-real.cpp b/flang/runtime/Float128Math/modulo-real.cpp
new file mode 100644
index 00000000000000..13000aba8c8323
--- /dev/null
+++ b/flang/runtime/Float128Math/modulo-real.cpp
@@ -0,0 +1,24 @@
+//===-- runtime/Float128Math/modulo-real.cpp ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+#include "numeric-template-specs.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+// MODULO (16.9.136)
+F128Type RTDEF(ModuloReal16)(
+    F128Type x, F128Type p, const char *sourceFile, int sourceLine) {
+  return RealMod<true>(x, p, sourceFile, sourceLine);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/nearest.cpp b/flang/runtime/Float128Math/nearest.cpp
new file mode 100644
index 00000000000000..148ac4ef839160
--- /dev/null
+++ b/flang/runtime/Float128Math/nearest.cpp
@@ -0,0 +1,23 @@
+//===-- runtime/Float128Math/nearest.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(Nearest16)(
+    CppTypeFor<TypeCategory::Real, 16> x, bool positive) {
+  return Nextafter<true>::invoke(
+      x, positive ? F128_RT_INFINITY : -F128_RT_INFINITY);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/norm2.cpp b/flang/runtime/Float128Math/norm2.cpp
index 17453bd2d6cbd7..15c482f7f007ce 100644
--- a/flang/runtime/Float128Math/norm2.cpp
+++ b/flang/runtime/Float128Math/norm2.cpp
@@ -7,39 +7,17 @@
 //===----------------------------------------------------------------------===//
 
 #include "math-entries.h"
+#include "numeric-template-specs.h"
 #include "reduction-templates.h"
-#include <cmath>
-
-#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-
-namespace {
-using namespace Fortran::runtime;
-
-using AccumType = Norm2AccumType<16>;
-
-struct ABSTy {
-  static AccumType compute(AccumType x) {
-    return Sqrt<RTNAME(AbsF128)>::invoke(x);
-  }
-};
-
-struct SQRTTy {
-  static AccumType compute(AccumType x) {
-    return Sqrt<RTNAME(SqrtF128)>::invoke(x);
-  }
-};
-
-using Float128Norm2Accumulator = Norm2Accumulator<16, ABSTy, SQRTTy>;
-} // namespace
 
 namespace Fortran::runtime {
 extern "C" {
 
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(Norm2_16)(
     const Descriptor &x, const char *source, int line, int dim) {
-  auto accumulator{::Float128Norm2Accumulator(x)};
   return GetTotalReduction<TypeCategory::Real, 16>(
-      x, source, line, dim, nullptr, accumulator, "NORM2");
+      x, source, line, dim, nullptr, Norm2Accumulator<16>{x}, "NORM2");
 }
 
 void RTDEF(Norm2DimReal16)(Descriptor &result, const Descriptor &x, int dim,
@@ -49,11 +27,9 @@ void RTDEF(Norm2DimReal16)(Descriptor &result, const Descriptor &x, int dim,
   RUNTIME_CHECK(terminator, type);
   RUNTIME_CHECK(
       terminator, type->first == TypeCategory::Real && type->second == 16);
-  DoMaxMinNorm2<TypeCategory::Real, 16, ::Float128Norm2Accumulator>(
-      result, x, dim, nullptr, "NORM2", terminator);
+  Norm2Helper<16>{}(result, x, dim, nullptr, terminator);
 }
+#endif
 
 } // extern "C"
 } // namespace Fortran::runtime
-
-#endif
diff --git a/flang/runtime/Float128Math/numeric-template-specs.h b/flang/runtime/Float128Math/numeric-template-specs.h
new file mode 100644
index 00000000000000..a0a77230c3e9eb
--- /dev/null
+++ b/flang/runtime/Float128Math/numeric-template-specs.h
@@ -0,0 +1,55 @@
+//===-- runtime/Float128Math/numeric-template-specs.h -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_FLOAT128MATH_NUMERIC_TEMPLATE_SPECS_H_
+#define FORTRAN_RUNTIME_FLOAT128MATH_NUMERIC_TEMPLATE_SPECS_H_
+
+#include "math-entries.h"
+#include "numeric-templates.h"
+
+namespace Fortran::runtime {
+using F128Type = CppTypeFor<TypeCategory::Real, 16>;
+
+template <> struct ABSTy<F128Type> {
+  static F128Type compute(F128Type x) { return Abs<true>::invoke(x); }
+};
+
+template <> struct FREXPTy<F128Type> {
+  static F128Type compute(F128Type x, int *e) {
+    return Frexp<true>::invoke(x, e);
+  }
+};
+
+template <> struct ILOGBTy<F128Type> {
+  static int compute(F128Type x) { return Ilogb<true>::invoke(x); }
+};
+
+template <> struct ISINFTy<F128Type> {
+  static bool compute(F128Type x) { return Isinf<true>::invoke(x); }
+};
+
+template <> struct ISNANTy<F128Type> {
+  static bool compute(F128Type x) { return Isnan<true>::invoke(x); }
+};
+
+template <> struct LDEXPTy<F128Type> {
+  template <typename ET> static F128Type compute(F128Type x, ET p) {
+    return Ldexp<true>::invoke(x, p);
+  }
+};
+
+template <> struct QNANTy<F128Type> {
+  static F128Type compute() { return F128_RT_QNAN; }
+};
+
+template <> struct SQRTTy<F128Type> {
+  static F128Type compute(F128Type x) { return Sqrt<true>::invoke(x); }
+};
+
+} // namespace Fortran::runtime
+#endif // FORTRAN_RUNTIME_FLOAT128MATH_NUMERIC_TEMPLATE_SPECS_H_
diff --git a/flang/runtime/Float128Math/pow.cpp b/flang/runtime/Float128Math/pow.cpp
index 02958a890e5221..7a48828ee3e765 100644
--- a/flang/runtime/Float128Math/pow.cpp
+++ b/flang/runtime/Float128Math/pow.cpp
@@ -15,7 +15,7 @@ extern "C" {
 CppTypeFor<TypeCategory::Real, 16> RTDEF(PowF128)(
     CppTypeFor<TypeCategory::Real, 16> x,
     CppTypeFor<TypeCategory::Real, 16> y) {
-  return Pow<RTNAME(PowF128)>::invoke(x, y);
+  return Pow<true>::invoke(x, y);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/round.cpp b/flang/runtime/Float128Math/round.cpp
index 43ab57768cb77a..6420c1bc9cd25d 100644
--- a/flang/runtime/Float128Math/round.cpp
+++ b/flang/runtime/Float128Math/round.cpp
@@ -18,7 +18,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(RoundF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Round<RTNAME(RoundF128)>::invoke(x);
+  return Round<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/rrspacing.cpp b/flang/runtime/Float128Math/rrspacing.cpp
new file mode 100644
index 00000000000000..feddac418eec39
--- /dev/null
+++ b/flang/runtime/Float128Math/rrspacing.cpp
@@ -0,0 +1,21 @@
+//===-- runtime/Float128Math/rrspacing.cpp --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+#include "numeric-template-specs.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+// FRACTION (16.9.80)
+F128Type RTDEF(RRSpacing16)(F128Type x) { return RRSpacing<113>(x); }
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/scale.cpp b/flang/runtime/Float128Math/scale.cpp
new file mode 100644
index 00000000000000..0be958bd9f2a72
--- /dev/null
+++ b/flang/runtime/Float128Math/scale.cpp
@@ -0,0 +1,28 @@
+//===-- runtime/Float128Math/scale.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+#include "numeric-template-specs.h"
+#include <limits>
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+F128Type RTDEF(Scale16)(F128Type x, std::int64_t p) {
+  auto ip{static_cast<int>(p)};
+  if (ip != p) {
+    ip = p < 0 ? std::numeric_limits<int>::min()
+               : std::numeric_limits<int>::max();
+  }
+  return LDEXPTy<F128Type>::compute(x, ip);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/set-exponent.cpp b/flang/runtime/Float128Math/set-exponent.cpp
new file mode 100644
index 00000000000000..99c34af7962b9a
--- /dev/null
+++ b/flang/runtime/Float128Math/set-exponent.cpp
@@ -0,0 +1,23 @@
+//===-- runtime/Float128Math/set-exponent.cpp -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+#include "numeric-template-specs.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+// SET_EXPONENT (16.9.171)
+F128Type RTDEF(SetExponent16)(F128Type x, std::int64_t p) {
+  return SetExponent(x, p);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/sin.cpp b/flang/runtime/Float128Math/sin.cpp
index 013eb9d119a6a3..8ebc3f9881586e 100644
--- a/flang/runtime/Float128Math/sin.cpp
+++ b/flang/runtime/Float128Math/sin.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(SinF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Sin<RTNAME(SinF128)>::invoke(x);
+  return Sin<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/sinh.cpp b/flang/runtime/Float128Math/sinh.cpp
index 9c907041fd7eb4..aa716a3e51ef5a 100644
--- a/flang/runtime/Float128Math/sinh.cpp
+++ b/flang/runtime/Float128Math/sinh.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(SinhF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Sinh<RTNAME(SinhF128)>::invoke(x);
+  return Sinh<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/spacing.cpp b/flang/runtime/Float128Math/spacing.cpp
new file mode 100644
index 00000000000000..a86c0b30e567ab
--- /dev/null
+++ b/flang/runtime/Float128Math/spacing.cpp
@@ -0,0 +1,21 @@
+//===-- runtime/Float128Math/spacing.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+#include "numeric-template-specs.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+// SPACING (16.9.180)
+F128Type RTDEF(Spacing16)(F128Type x) { return Spacing<113>(x); }
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/sqrt.cpp b/flang/runtime/Float128Math/sqrt.cpp
index aafbd850ca973a..83165a4c623191 100644
--- a/flang/runtime/Float128Math/sqrt.cpp
+++ b/flang/runtime/Float128Math/sqrt.cpp
@@ -7,15 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "math-entries.h"
+#include "numeric-template-specs.h"
 
 namespace Fortran::runtime {
 extern "C" {
 
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTDEF(SqrtF128)(
-    CppTypeFor<TypeCategory::Real, 16> x) {
-  return Sqrt<RTNAME(SqrtF128)>::invoke(x);
-}
+F128Type RTDEF(SqrtF128)(F128Type x) { return SQRTTy<F128Type>::compute(x); }
 #endif
 
 } // extern "C"
diff --git a/flang/runtime/Float128Math/tan.cpp b/flang/runtime/Float128Math/tan.cpp
index 01d3c7bdd2e85d..8f4b723ca977bd 100644
--- a/flang/runtime/Float128Math/tan.cpp
+++ b/flang/runtime/Float128Math/tan.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(TanF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Tan<RTNAME(TanF128)>::invoke(x);
+  return Tan<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/tanh.cpp b/flang/runtime/Float128Math/tanh.cpp
index fedc1a4120caf5..b43a89520b6797 100644
--- a/flang/runtime/Float128Math/tanh.cpp
+++ b/flang/runtime/Float128Math/tanh.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(TanhF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Tanh<RTNAME(TanhF128)>::invoke(x);
+  return Tanh<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/tgamma.cpp b/flang/runtime/Float128Math/tgamma.cpp
index 329defff38cf91..93f97800bdc966 100644
--- a/flang/runtime/Float128Math/tgamma.cpp
+++ b/flang/runtime/Float128Math/tgamma.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(TgammaF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Tgamma<RTNAME(TgammaF128)>::invoke(x);
+  return Tgamma<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/trunc.cpp b/flang/runtime/Float128Math/trunc.cpp
index 3cab219ce31c2d..ca15a739c030e8 100644
--- a/flang/runtime/Float128Math/trunc.cpp
+++ b/flang/runtime/Float128Math/trunc.cpp
@@ -18,7 +18,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(TruncF128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Trunc<RTNAME(TruncF128)>::invoke(x);
+  return Trunc<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/y0.cpp b/flang/runtime/Float128Math/y0.cpp
index f3e2ee454aeab5..d6f39aac1053a8 100644
--- a/flang/runtime/Float128Math/y0.cpp
+++ b/flang/runtime/Float128Math/y0.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(Y0F128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Y0<RTNAME(Y0F128)>::invoke(x);
+  return Y0<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/y1.cpp b/flang/runtime/Float128Math/y1.cpp
index c117bbcb2b5a86..477d36a9ea3c66 100644
--- a/flang/runtime/Float128Math/y1.cpp
+++ b/flang/runtime/Float128Math/y1.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(Y1F128)(
     CppTypeFor<TypeCategory::Real, 16> x) {
-  return Y1<RTNAME(Y1F128)>::invoke(x);
+  return Y1<true>::invoke(x);
 }
 #endif
 
diff --git a/flang/runtime/Float128Math/yn.cpp b/flang/runtime/Float128Math/yn.cpp
index 237bc2866a0d5b..3a040cc8858970 100644
--- a/flang/runtime/Float128Math/yn.cpp
+++ b/flang/runtime/Float128Math/yn.cpp
@@ -14,7 +14,7 @@ extern "C" {
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(YnF128)(
     int n, CppTypeFor<TypeCategory::Real, 16> x) {
-  return Yn<RTNAME(YnF128)>::invoke(n, x);
+  return Yn<true>::invoke(n, x);
 }
 #endif
 
diff --git a/flang/runtime/extrema.cpp b/flang/runtime/extrema.cpp
index fc2b4e165cb269..61afb0458430db 100644
--- a/flang/runtime/extrema.cpp
+++ b/flang/runtime/extrema.cpp
@@ -424,62 +424,6 @@ RT_EXT_API_GROUP_END
 
 // MAXVAL and MINVAL
 
-template <TypeCategory CAT, int KIND, bool IS_MAXVAL, typename Enable = void>
-struct MaxOrMinIdentity {
-  using Type = CppTypeFor<CAT, KIND>;
-  static constexpr RT_API_ATTRS Type Value() {
-    return IS_MAXVAL ? std::numeric_limits<Type>::lowest()
-                     : std::numeric_limits<Type>::max();
-  }
-};
-
-// std::numeric_limits<> may not know int128_t
-template <bool IS_MAXVAL>
-struct MaxOrMinIdentity<TypeCategory::Integer, 16, IS_MAXVAL> {
-  using Type = CppTypeFor<TypeCategory::Integer, 16>;
-  static constexpr RT_API_ATTRS Type Value() {
-    return IS_MAXVAL ? Type{1} << 127 : ~Type{0} >> 1;
-  }
-};
-
-#if HAS_FLOAT128
-// std::numeric_limits<> may not support __float128.
-//
-// Usage of GCC quadmath.h's FLT128_MAX is complicated by the fact that
-// even GCC complains about 'Q' literal suffix under -Wpedantic.
-// We just recreate FLT128_MAX ourselves.
-//
-// This specialization must engage only when
-// CppTypeFor<TypeCategory::Real, 16> is __float128.
-template <bool IS_MAXVAL>
-struct MaxOrMinIdentity<TypeCategory::Real, 16, IS_MAXVAL,
-    typename std::enable_if_t<
-        std::is_same_v<CppTypeFor<TypeCategory::Real, 16>, __float128>>> {
-  using Type = __float128;
-  static RT_API_ATTRS Type Value() {
-    // Create a buffer to store binary representation of __float128 constant.
-    constexpr std::size_t alignment =
-        std::max(alignof(Type), alignof(std::uint64_t));
-    alignas(alignment) char data[sizeof(Type)];
-
-    // First, verify that our interpretation of __float128 format is correct,
-    // e.g. by checking at least one known constant.
-    *reinterpret_cast<Type *>(data) = Type(1.0);
-    if (*reinterpret_cast<std::uint64_t *>(data) != 0 ||
-        *(reinterpret_cast<std::uint64_t *>(data) + 1) != 0x3FFF000000000000) {
-      Terminator terminator{__FILE__, __LINE__};
-      terminator.Crash("not yet implemented: no full support for __float128");
-    }
-
-    // Recreate FLT128_MAX.
-    *reinterpret_cast<std::uint64_t *>(data) = 0xFFFFFFFFFFFFFFFF;
-    *(reinterpret_cast<std::uint64_t *>(data) + 1) = 0x7FFEFFFFFFFFFFFF;
-    Type max = *reinterpret_cast<Type *>(data);
-    return IS_MAXVAL ? -max : max;
-  }
-};
-#endif // HAS_FLOAT128
-
 template <TypeCategory CAT, int KIND, bool IS_MAXVAL>
 class NumericExtremumAccumulator {
 public:
@@ -773,42 +717,25 @@ RT_EXT_API_GROUP_END
 
 // NORM2
 
-template <int KIND> struct Norm2Helper {
-  RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x, int dim,
-      const Descriptor *mask, Terminator &terminator) const {
-    DoMaxMinNorm2<TypeCategory::Real, KIND,
-        typename Norm2AccumulatorGetter<KIND>::Type>(
-        result, x, dim, mask, "NORM2", terminator);
-  }
-};
-
 extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
 // TODO: REAL(2 & 3)
 CppTypeFor<TypeCategory::Real, 4> RTDEF(Norm2_4)(
     const Descriptor &x, const char *source, int line, int dim) {
-  return GetTotalReduction<TypeCategory::Real, 4>(x, source, line, dim, nullptr,
-      Norm2AccumulatorGetter<4>::create(x), "NORM2");
+  return GetTotalReduction<TypeCategory::Real, 4>(
+      x, source, line, dim, nullptr, Norm2Accumulator<4>{x}, "NORM2");
 }
 CppTypeFor<TypeCategory::Real, 8> RTDEF(Norm2_8)(
     const Descriptor &x, const char *source, int line, int dim) {
-  return GetTotalReduction<TypeCategory::Real, 8>(x, source, line, dim, nullptr,
-      Norm2AccumulatorGetter<8>::create(x), "NORM2");
+  return GetTotalReduction<TypeCategory::Real, 8>(
+      x, source, line, dim, nullptr, Norm2Accumulator<8>{x}, "NORM2");
 }
 #if LDBL_MANT_DIG == 64
 CppTypeFor<TypeCategory::Real, 10> RTDEF(Norm2_10)(
     const Descriptor &x, const char *source, int line, int dim) {
-  return GetTotalReduction<TypeCategory::Real, 10>(x, source, line, dim,
-      nullptr, Norm2AccumulatorGetter<10>::create(x), "NORM2");
-}
-#endif
-#if LDBL_MANT_DIG == 113
-// The __float128 implementation resides in FortranFloat128Math library.
-CppTypeFor<TypeCategory::Real, 16> RTDEF(Norm2_16)(
-    const Descriptor &x, const char *source, int line, int dim) {
-  return GetTotalReduction<TypeCategory::Real, 16>(x, source, line, dim,
-      nullptr, Norm2AccumulatorGetter<16>::create(x), "NORM2");
+  return GetTotalReduction<TypeCategory::Real, 10>(
+      x, source, line, dim, nullptr, Norm2Accumulator<10>{x}, "NORM2");
 }
 #endif
 
diff --git a/flang/runtime/numeric-templates.h b/flang/runtime/numeric-templates.h
new file mode 100644
index 00000000000000..b16440dbc2241a
--- /dev/null
+++ b/flang/runtime/numeric-templates.h
@@ -0,0 +1,339 @@
+//===-- runtime/numeric-templates.h -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Generic class and function templates used for implementing
+// various numeric intrinsics (EXPONENT, FRACTION, etc.).
+//
+// This header file also defines generic templates for "basic"
+// math operations like abs, isnan, etc. The Float128Math
+// library provides specializations for these templates
+// for the data type corresponding to CppTypeFor<TypeCategory::Real, 16>
+// on the target.
+
+#ifndef FORTRAN_RUNTIME_NUMERIC_TEMPLATES_H_
+#define FORTRAN_RUNTIME_NUMERIC_TEMPLATES_H_
+
+#include "terminator.h"
+#include "tools.h"
+#include "flang/Common/float128.h"
+#include <cstdint>
+#include <limits>
+
+namespace Fortran::runtime {
+
+// MAX/MIN/LOWEST values for different data types.
+
+// MaxOrMinIdentity returns MAX or LOWEST value of the given type.
+template <TypeCategory CAT, int KIND, bool IS_MAXVAL, typename Enable = void>
+struct MaxOrMinIdentity {
+  using Type = CppTypeFor<CAT, KIND>;
+  static constexpr RT_API_ATTRS Type Value() {
+    return IS_MAXVAL ? std::numeric_limits<Type>::lowest()
+                     : std::numeric_limits<Type>::max();
+  }
+};
+
+// std::numeric_limits<> may not know int128_t
+template <bool IS_MAXVAL>
+struct MaxOrMinIdentity<TypeCategory::Integer, 16, IS_MAXVAL> {
+  using Type = CppTypeFor<TypeCategory::Integer, 16>;
+  static constexpr RT_API_ATTRS Type Value() {
+    return IS_MAXVAL ? Type{1} << 127 : ~Type{0} >> 1;
+  }
+};
+
+#if HAS_FLOAT128
+// std::numeric_limits<> may not support __float128.
+//
+// Usage of GCC quadmath.h's FLT128_MAX is complicated by the fact that
+// even GCC complains about 'Q' literal suffix under -Wpedantic.
+// We just recreate FLT128_MAX ourselves.
+//
+// This specialization must engage only when
+// CppTypeFor<TypeCategory::Real, 16> is __float128.
+template <bool IS_MAXVAL>
+struct MaxOrMinIdentity<TypeCategory::Real, 16, IS_MAXVAL,
+    typename std::enable_if_t<
+        std::is_same_v<CppTypeFor<TypeCategory::Real, 16>, __float128>>> {
+  using Type = __float128;
+  static RT_API_ATTRS Type Value() {
+    // Create a buffer to store binary representation of __float128 constant.
+    constexpr std::size_t alignment =
+        std::max(alignof(Type), alignof(std::uint64_t));
+    alignas(alignment) char data[sizeof(Type)];
+
+    // First, verify that our interpretation of __float128 format is correct,
+    // e.g. by checking at least one known constant.
+    *reinterpret_cast<Type *>(data) = Type(1.0);
+    if (*reinterpret_cast<std::uint64_t *>(data) != 0 ||
+        *(reinterpret_cast<std::uint64_t *>(data) + 1) != 0x3FFF000000000000) {
+      Terminator terminator{__FILE__, __LINE__};
+      terminator.Crash("not yet implemented: no full support for __float128");
+    }
+
+    // Recreate FLT128_MAX.
+    *reinterpret_cast<std::uint64_t *>(data) = 0xFFFFFFFFFFFFFFFF;
+    *(reinterpret_cast<std::uint64_t *>(data) + 1) = 0x7FFEFFFFFFFFFFFF;
+    Type max = *reinterpret_cast<Type *>(data);
+    return IS_MAXVAL ? -max : max;
+  }
+};
+#endif // HAS_FLOAT128
+
+// Minimum finite representable value.
+// For floating-point types, returns minimum positive normalized value.
+template <typename T> struct MinValue {
+  static RT_API_ATTRS T get() { return std::numeric_limits<T>::min(); }
+};
+
+#if HAS_FLOAT128
+template <> struct MinValue<CppTypeFor<TypeCategory::Real, 16>> {
+  using Type = CppTypeFor<TypeCategory::Real, 16>;
+  static RT_API_ATTRS Type get() {
+    // Create a buffer to store binary representation of __float128 constant.
+    constexpr std::size_t alignment =
+        std::max(alignof(Type), alignof(std::uint64_t));
+    alignas(alignment) char data[sizeof(Type)];
+
+    // First, verify that our interpretation of __float128 format is correct,
+    // e.g. by checking at least one known constant.
+    *reinterpret_cast<Type *>(data) = Type(1.0);
+    if (*reinterpret_cast<std::uint64_t *>(data) != 0 ||
+        *(reinterpret_cast<std::uint64_t *>(data) + 1) != 0x3FFF000000000000) {
+      Terminator terminator{__FILE__, __LINE__};
+      terminator.Crash("not yet implemented: no full support for __float128");
+    }
+
+    // Recreate FLT128_MIN.
+    *reinterpret_cast<std::uint64_t *>(data) = 0;
+    *(reinterpret_cast<std::uint64_t *>(data) + 1) = 0x1000000000000;
+    return *reinterpret_cast<Type *>(data);
+  }
+};
+#endif // HAS_FLOAT128
+
+template <typename T> struct ABSTy {
+  static constexpr RT_API_ATTRS T compute(T x) { return std::abs(x); }
+};
+
+template <typename T> struct FREXPTy {
+  static constexpr RT_API_ATTRS T compute(T x, int *e) {
+    return std::frexp(x, e);
+  }
+};
+
+template <typename T> struct ILOGBTy {
+  static constexpr RT_API_ATTRS int compute(T x) { return std::ilogb(x); }
+};
+
+template <typename T> struct ISINFTy {
+  static constexpr RT_API_ATTRS bool compute(T x) { return std::isinf(x); }
+};
+
+template <typename T> struct ISNANTy {
+  static constexpr RT_API_ATTRS bool compute(T x) { return std::isnan(x); }
+};
+
+template <typename T> struct LDEXPTy {
+  template <typename ET> static constexpr RT_API_ATTRS T compute(T x, ET e) {
+    return std::ldexp(x, e);
+  }
+};
+
+template <typename T> struct MAXTy {
+  static constexpr RT_API_ATTRS T compute() {
+    return std::numeric_limits<T>::max();
+  }
+};
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+template <> struct MAXTy<CppTypeFor<TypeCategory::Real, 16>> {
+  static CppTypeFor<TypeCategory::Real, 16> compute() {
+    return MaxOrMinIdentity<TypeCategory::Real, 16, true>::Value();
+  }
+};
+#endif
+
+template <typename T> struct MINTy {
+  static constexpr RT_API_ATTRS T compute() { return MinValue<T>::get(); }
+};
+
+template <typename T> struct QNANTy {
+  static constexpr RT_API_ATTRS T compute() {
+    return std::numeric_limits<T>::quiet_NaN();
+  }
+};
+
+template <typename T> struct SQRTTy {
+  static constexpr RT_API_ATTRS T compute(T x) { return std::sqrt(x); }
+};
+
+// EXPONENT (16.9.75)
+template <typename RESULT, typename ARG>
+inline RT_API_ATTRS RESULT Exponent(ARG x) {
+  if (ISINFTy<ARG>::compute(x) || ISNANTy<ARG>::compute(x)) {
+    return MAXTy<RESULT>::compute(); // +/-Inf, NaN -> HUGE(0)
+  } else if (x == 0) {
+    return 0; // 0 -> 0
+  } else {
+    return ILOGBTy<ARG>::compute(x) + 1;
+  }
+}
+
+// Suppress the warnings about calling __host__-only std::frexp,
+// defined in C++ STD header files, from __device__ code.
+RT_DIAG_PUSH
+RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+
+// FRACTION (16.9.80)
+template <typename T> inline RT_API_ATTRS T Fraction(T x) {
+  if (ISNANTy<T>::compute(x)) {
+    return x; // NaN -> same NaN
+  } else if (ISINFTy<T>::compute(x)) {
+    return QNANTy<T>::compute(); // +/-Inf -> NaN
+  } else if (x == 0) {
+    return x; // 0 -> same 0
+  } else {
+    int ignoredExp;
+    return FREXPTy<T>::compute(x, &ignoredExp);
+  }
+}
+
+RT_DIAG_POP
+
+// SET_EXPONENT (16.9.171)
+template <typename T> inline RT_API_ATTRS T SetExponent(T x, std::int64_t p) {
+  if (ISNANTy<T>::compute(x)) {
+    return x; // NaN -> same NaN
+  } else if (ISINFTy<T>::compute(x)) {
+    return QNANTy<T>::compute(); // +/-Inf -> NaN
+  } else if (x == 0) {
+    return x; // return negative zero if x is negative zero
+  } else {
+    int expo{ILOGBTy<T>::compute(x) + 1};
+    auto ip{static_cast<int>(p - expo)};
+    if (ip != p - expo) {
+      ip = p < 0 ? std::numeric_limits<int>::min()
+                 : std::numeric_limits<int>::max();
+    }
+    return LDEXPTy<T>::compute(x, ip); // x*2**(p-e)
+  }
+}
+
+// MOD & MODULO (16.9.135, .136)
+template <bool IS_MODULO, typename T>
+inline RT_API_ATTRS T RealMod(
+    T a, T p, const char *sourceFile, int sourceLine) {
+  if (p == 0) {
+    Terminator{sourceFile, sourceLine}.Crash(
+        IS_MODULO ? "MODULO with P==0" : "MOD with P==0");
+  }
+  if (ISNANTy<T>::compute(a) || ISNANTy<T>::compute(p) ||
+      ISINFTy<T>::compute(a)) {
+    return QNANTy<T>::compute();
+  } else if (ISINFTy<T>::compute(p)) {
+    return a;
+  }
+  T aAbs{ABSTy<T>::compute(a)};
+  T pAbs{ABSTy<T>::compute(p)};
+  if (aAbs <= static_cast<T>(std::numeric_limits<std::int64_t>::max()) &&
+      pAbs <= static_cast<T>(std::numeric_limits<std::int64_t>::max())) {
+    if (auto aInt{static_cast<std::int64_t>(a)}; a == aInt) {
+      if (auto pInt{static_cast<std::int64_t>(p)}; p == pInt) {
+        // Fast exact case for integer operands
+        auto mod{aInt - (aInt / pInt) * pInt};
+        if (IS_MODULO && (aInt > 0) != (pInt > 0)) {
+          mod += pInt;
+        }
+        return static_cast<T>(mod);
+      }
+    }
+  }
+  if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double> ||
+      std::is_same_v<T, long double>) {
+    // std::fmod() semantics on signed operands seems to match
+    // the requirements of MOD().  MODULO() needs adjustment.
+    T result{std::fmod(a, p)};
+    if constexpr (IS_MODULO) {
+      if ((a < 0) != (p < 0)) {
+        if (result == 0.) {
+          result = -result;
+        } else {
+          result += p;
+        }
+      }
+    }
+    return result;
+  } else {
+    // The standard defines MOD(a,p)=a-AINT(a/p)*p and
+    // MODULO(a,p)=a-FLOOR(a/p)*p, but those definitions lose
+    // precision badly due to cancellation when ABS(a) is
+    // much larger than ABS(p).
+    // Insights:
+    //  - MOD(a,p)=MOD(a-n*p,p) when a>0, p>0, integer n>0, and a>=n*p
+    //  - when n is a power of two, n*p is exact
+    //  - as a>=n*p, a-n*p does not round.
+    // So repeatedly reduce a by all n*p in decreasing order of n;
+    // what's left is the desired remainder.  This is basically
+    // the same algorithm as arbitrary precision binary long division,
+    // discarding the quotient.
+    T tmp{aAbs};
+    for (T adj{SetExponent(pAbs, Exponent<int>(aAbs))}; tmp >= pAbs; adj /= 2) {
+      if (tmp >= adj) {
+        tmp -= adj;
+        if (tmp == 0) {
+          break;
+        }
+      }
+    }
+    if (a < 0) {
+      tmp = -tmp;
+    }
+    if constexpr (IS_MODULO) {
+      if ((a < 0) != (p < 0)) {
+        tmp += p;
+      }
+    }
+    return tmp;
+  }
+}
+
+// RRSPACING (16.9.164)
+template <int PREC, typename T> inline RT_API_ATTRS T RRSpacing(T x) {
+  if (ISNANTy<T>::compute(x)) {
+    return x; // NaN -> same NaN
+  } else if (ISINFTy<T>::compute(x)) {
+    return QNANTy<T>::compute(); // +/-Inf -> NaN
+  } else if (x == 0) {
+    return 0; // 0 -> 0
+  } else {
+    return LDEXPTy<T>::compute(
+        ABSTy<T>::compute(x), PREC - (ILOGBTy<T>::compute(x) + 1));
+  }
+}
+
+// SPACING (16.9.180)
+template <int PREC, typename T> inline RT_API_ATTRS T Spacing(T x) {
+  if (ISNANTy<T>::compute(x)) {
+    return x; // NaN -> same NaN
+  } else if (ISINFTy<T>::compute(x)) {
+    return QNANTy<T>::compute(); // +/-Inf -> NaN
+  } else if (x == 0) {
+    // The standard-mandated behavior seems broken, since TINY() can't be
+    // subnormal.
+    return MINTy<T>::compute(); // 0 -> TINY(x)
+  } else {
+    T result{LDEXPTy<T>::compute(
+        static_cast<T>(1.0), ILOGBTy<T>::compute(x) + 1 - PREC)}; // 2**(e-p)
+    return result == 0 ? /*TINY(x)*/ MINTy<T>::compute() : result;
+  }
+}
+
+} // namespace Fortran::runtime
+
+#endif // FORTRAN_RUNTIME_NUMERIC_TEMPLATES_H_
diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp
index ede00d69f20e25..abd3e500029fe4 100644
--- a/flang/runtime/numeric.cpp
+++ b/flang/runtime/numeric.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/numeric.h"
+#include "numeric-templates.h"
 #include "terminator.h"
 #include "flang/Common/float128.h"
 #include <cfloat>
@@ -68,58 +69,6 @@ inline RT_API_ATTRS RESULT Floor(ARG x) {
   return std::floor(x);
 }
 
-// EXPONENT (16.9.75)
-template <typename RESULT, typename ARG>
-inline RT_API_ATTRS RESULT Exponent(ARG x) {
-  if (std::isinf(x) || std::isnan(x)) {
-    return std::numeric_limits<RESULT>::max(); // +/-Inf, NaN -> HUGE(0)
-  } else if (x == 0) {
-    return 0; // 0 -> 0
-  } else {
-    return std::ilogb(x) + 1;
-  }
-}
-
-// Suppress the warnings about calling __host__-only std::frexp,
-// defined in C++ STD header files, from __device__ code.
-RT_DIAG_PUSH
-RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
-
-// FRACTION (16.9.80)
-template <typename T> inline RT_API_ATTRS T Fraction(T x) {
-  if (std::isnan(x)) {
-    return x; // NaN -> same NaN
-  } else if (std::isinf(x)) {
-    return std::numeric_limits<T>::quiet_NaN(); // +/-Inf -> NaN
-  } else if (x == 0) {
-    return x; // 0 -> same 0
-  } else {
-    int ignoredExp;
-    return std::frexp(x, &ignoredExp);
-  }
-}
-
-RT_DIAG_POP
-
-// SET_EXPONENT (16.9.171)
-template <typename T> inline RT_API_ATTRS T SetExponent(T x, std::int64_t p) {
-  if (std::isnan(x)) {
-    return x; // NaN -> same NaN
-  } else if (std::isinf(x)) {
-    return std::numeric_limits<T>::quiet_NaN(); // +/-Inf -> NaN
-  } else if (x == 0) {
-    return x; // return negative zero if x is negative zero
-  } else {
-    int expo{std::ilogb(x) + 1};
-    auto ip{static_cast<int>(p - expo)};
-    if (ip != p - expo) {
-      ip = p < 0 ? std::numeric_limits<int>::min()
-                 : std::numeric_limits<int>::max();
-    }
-    return std::ldexp(x, ip); // x*2**(p-e)
-  }
-}
-
 // MOD & MODULO (16.9.135, .136)
 template <bool IS_MODULO, typename T>
 inline RT_API_ATTRS T IntMod(T x, T p, const char *sourceFile, int sourceLine) {
@@ -133,94 +82,6 @@ inline RT_API_ATTRS T IntMod(T x, T p, const char *sourceFile, int sourceLine) {
   }
   return mod;
 }
-template <bool IS_MODULO, typename T>
-inline RT_API_ATTRS T RealMod(
-    T a, T p, const char *sourceFile, int sourceLine) {
-  if (p == 0) {
-    Terminator{sourceFile, sourceLine}.Crash(
-        IS_MODULO ? "MODULO with P==0" : "MOD with P==0");
-  }
-  if (std::isnan(a) || std::isnan(p) || std::isinf(a)) {
-    return std::numeric_limits<T>::quiet_NaN();
-  } else if (std::isinf(p)) {
-    return a;
-  }
-  T aAbs{std::abs(a)};
-  T pAbs{std::abs(p)};
-  if (aAbs <= static_cast<T>(std::numeric_limits<std::int64_t>::max()) &&
-      pAbs <= static_cast<T>(std::numeric_limits<std::int64_t>::max())) {
-    if (auto aInt{static_cast<std::int64_t>(a)}; a == aInt) {
-      if (auto pInt{static_cast<std::int64_t>(p)}; p == pInt) {
-        // Fast exact case for integer operands
-        auto mod{aInt - (aInt / pInt) * pInt};
-        if (IS_MODULO && (aInt > 0) != (pInt > 0)) {
-          mod += pInt;
-        }
-        return static_cast<T>(mod);
-      }
-    }
-  }
-  if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double> ||
-      std::is_same_v<T, long double>) {
-    // std::fmod() semantics on signed operands seems to match
-    // the requirements of MOD().  MODULO() needs adjustment.
-    T result{std::fmod(a, p)};
-    if constexpr (IS_MODULO) {
-      if ((a < 0) != (p < 0)) {
-        if (result == 0.) {
-          result = -result;
-        } else {
-          result += p;
-        }
-      }
-    }
-    return result;
-  } else {
-    // The standard defines MOD(a,p)=a-AINT(a/p)*p and
-    // MODULO(a,p)=a-FLOOR(a/p)*p, but those definitions lose
-    // precision badly due to cancellation when ABS(a) is
-    // much larger than ABS(p).
-    // Insights:
-    //  - MOD(a,p)=MOD(a-n*p,p) when a>0, p>0, integer n>0, and a>=n*p
-    //  - when n is a power of two, n*p is exact
-    //  - as a>=n*p, a-n*p does not round.
-    // So repeatedly reduce a by all n*p in decreasing order of n;
-    // what's left is the desired remainder.  This is basically
-    // the same algorithm as arbitrary precision binary long division,
-    // discarding the quotient.
-    T tmp{aAbs};
-    for (T adj{SetExponent(pAbs, Exponent<int>(aAbs))}; tmp >= pAbs; adj /= 2) {
-      if (tmp >= adj) {
-        tmp -= adj;
-        if (tmp == 0) {
-          break;
-        }
-      }
-    }
-    if (a < 0) {
-      tmp = -tmp;
-    }
-    if constexpr (IS_MODULO) {
-      if ((a < 0) != (p < 0)) {
-        tmp += p;
-      }
-    }
-    return tmp;
-  }
-}
-
-// RRSPACING (16.9.164)
-template <int PREC, typename T> inline RT_API_ATTRS T RRSpacing(T x) {
-  if (std::isnan(x)) {
-    return x; // NaN -> same NaN
-  } else if (std::isinf(x)) {
-    return std::numeric_limits<T>::quiet_NaN(); // +/-Inf -> NaN
-  } else if (x == 0) {
-    return 0; // 0 -> 0
-  } else {
-    return std::ldexp(std::abs(x), PREC - (std::ilogb(x) + 1));
-  }
-}
 
 // SCALE (16.9.166)
 template <typename T> inline RT_API_ATTRS T Scale(T x, std::int64_t p) {
@@ -229,7 +90,7 @@ template <typename T> inline RT_API_ATTRS T Scale(T x, std::int64_t p) {
     ip = p < 0 ? std::numeric_limits<int>::min()
                : std::numeric_limits<int>::max();
   }
-  return std::ldexp(x, p); // x*2**p
+  return std::ldexp(x, ip); // x*2**p
 }
 
 // SELECTED_INT_KIND (16.9.169)
@@ -300,23 +161,6 @@ inline RT_API_ATTRS CppTypeFor<TypeCategory::Integer, 4> SelectedRealKind(
   return error ? error : kind;
 }
 
-// SPACING (16.9.180)
-template <int PREC, typename T> inline RT_API_ATTRS T Spacing(T x) {
-  if (std::isnan(x)) {
-    return x; // NaN -> same NaN
-  } else if (std::isinf(x)) {
-    return std::numeric_limits<T>::quiet_NaN(); // +/-Inf -> NaN
-  } else if (x == 0) {
-    // The standard-mandated behavior seems broken, since TINY() can't be
-    // subnormal.
-    return std::numeric_limits<T>::min(); // 0 -> TINY(x)
-  } else {
-    T result{
-        std::ldexp(static_cast<T>(1.0), std::ilogb(x) + 1 - PREC)}; // 2**(e-p)
-    return result == 0 ? /*TINY(x)*/ std::numeric_limits<T>::min() : result;
-  }
-}
-
 // NEAREST (16.9.139)
 template <int PREC, typename T>
 inline RT_API_ATTRS T Nearest(T x, bool positive) {
@@ -480,15 +324,6 @@ CppTypeFor<TypeCategory::Integer, 8> RTDEF(Exponent10_8)(
     CppTypeFor<TypeCategory::Real, 10> x) {
   return Exponent<CppTypeFor<TypeCategory::Integer, 8>>(x);
 }
-#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Integer, 4> RTDEF(Exponent16_4)(
-    CppTypeFor<TypeCategory::Real, 16> x) {
-  return Exponent<CppTypeFor<TypeCategory::Integer, 4>>(x);
-}
-CppTypeFor<TypeCategory::Integer, 8> RTDEF(Exponent16_8)(
-    CppTypeFor<TypeCategory::Real, 16> x) {
-  return Exponent<CppTypeFor<TypeCategory::Integer, 8>>(x);
-}
 #endif
 
 CppTypeFor<TypeCategory::Integer, 1> RTDEF(Floor4_1)(
@@ -596,11 +431,6 @@ CppTypeFor<TypeCategory::Real, 10> RTDEF(Fraction10)(
     CppTypeFor<TypeCategory::Real, 10> x) {
   return Fraction(x);
 }
-#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTDEF(Fraction16)(
-    CppTypeFor<TypeCategory::Real, 16> x) {
-  return Fraction(x);
-}
 #endif
 
 bool RTDEF(IsFinite4)(CppTypeFor<TypeCategory::Real, 4> x) {
@@ -683,12 +513,6 @@ CppTypeFor<TypeCategory::Real, 10> RTDEF(ModReal10)(
     const char *sourceFile, int sourceLine) {
   return RealMod<false>(x, p, sourceFile, sourceLine);
 }
-#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTDEF(ModReal16)(
-    CppTypeFor<TypeCategory::Real, 16> x, CppTypeFor<TypeCategory::Real, 16> p,
-    const char *sourceFile, int sourceLine) {
-  return RealMod<false>(x, p, sourceFile, sourceLine);
-}
 #endif
 
 CppTypeFor<TypeCategory::Integer, 1> RTDEF(ModuloInteger1)(
@@ -739,12 +563,6 @@ CppTypeFor<TypeCategory::Real, 10> RTDEF(ModuloReal10)(
     const char *sourceFile, int sourceLine) {
   return RealMod<true>(x, p, sourceFile, sourceLine);
 }
-#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTDEF(ModuloReal16)(
-    CppTypeFor<TypeCategory::Real, 16> x, CppTypeFor<TypeCategory::Real, 16> p,
-    const char *sourceFile, int sourceLine) {
-  return RealMod<true>(x, p, sourceFile, sourceLine);
-}
 #endif
 
 CppTypeFor<TypeCategory::Real, 4> RTDEF(Nearest4)(
@@ -760,11 +578,6 @@ CppTypeFor<TypeCategory::Real, 10> RTDEF(Nearest10)(
     CppTypeFor<TypeCategory::Real, 10> x, bool positive) {
   return Nearest<64>(x, positive);
 }
-#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTDEF(Nearest16)(
-    CppTypeFor<TypeCategory::Real, 16> x, bool positive) {
-  return Nearest<113>(x, positive);
-}
 #endif
 
 CppTypeFor<TypeCategory::Integer, 1> RTDEF(Nint4_1)(
@@ -872,11 +685,6 @@ CppTypeFor<TypeCategory::Real, 10> RTDEF(RRSpacing10)(
     CppTypeFor<TypeCategory::Real, 10> x) {
   return RRSpacing<64>(x);
 }
-#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTDEF(RRSpacing16)(
-    CppTypeFor<TypeCategory::Real, 16> x) {
-  return RRSpacing<113>(x);
-}
 #endif
 
 CppTypeFor<TypeCategory::Real, 4> RTDEF(SetExponent4)(
@@ -892,11 +700,6 @@ CppTypeFor<TypeCategory::Real, 10> RTDEF(SetExponent10)(
     CppTypeFor<TypeCategory::Real, 10> x, std::int64_t p) {
   return SetExponent(x, p);
 }
-#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTDEF(SetExponent16)(
-    CppTypeFor<TypeCategory::Real, 16> x, std::int64_t p) {
-  return SetExponent(x, p);
-}
 #endif
 
 CppTypeFor<TypeCategory::Real, 4> RTDEF(Scale4)(
@@ -912,11 +715,6 @@ CppTypeFor<TypeCategory::Real, 10> RTDEF(Scale10)(
     CppTypeFor<TypeCategory::Real, 10> x, std::int64_t p) {
   return Scale(x, p);
 }
-#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTDEF(Scale16)(
-    CppTypeFor<TypeCategory::Real, 16> x, std::int64_t p) {
-  return Scale(x, p);
-}
 #endif
 
 // SELECTED_INT_KIND
@@ -971,11 +769,6 @@ CppTypeFor<TypeCategory::Real, 10> RTDEF(Spacing10)(
     CppTypeFor<TypeCategory::Real, 10> x) {
   return Spacing<64>(x);
 }
-#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTDEF(Spacing16)(
-    CppTypeFor<TypeCategory::Real, 16> x) {
-  return Spacing<113>(x);
-}
 #endif
 
 CppTypeFor<TypeCategory::Real, 4> RTDEF(FPow4i)(
diff --git a/flang/runtime/reduction-templates.h b/flang/runtime/reduction-templates.h
index 0891bc021ff753..5b793deb2a123d 100644
--- a/flang/runtime/reduction-templates.h
+++ b/flang/runtime/reduction-templates.h
@@ -1,4 +1,4 @@
-//===-- runtime/reduction-templates.h -------------------------------------===//
+//===-- runtime/reduction-templates.h ---------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -21,6 +21,7 @@
 #ifndef FORTRAN_RUNTIME_REDUCTION_TEMPLATES_H_
 #define FORTRAN_RUNTIME_REDUCTION_TEMPLATES_H_
 
+#include "numeric-templates.h"
 #include "terminator.h"
 #include "tools.h"
 #include "flang/Runtime/cpp-type.h"
@@ -385,7 +386,7 @@ template <int KIND>
 using Norm2AccumType =
     CppTypeFor<TypeCategory::Real, std::clamp(KIND, 8, Norm2LargestLDKind)>;
 
-template <int KIND, typename ABS, typename SQRT> class Norm2Accumulator {
+template <int KIND> class Norm2Accumulator {
 public:
   using Type = CppTypeFor<TypeCategory::Real, KIND>;
   using AccumType = Norm2AccumType<KIND>;
@@ -395,10 +396,10 @@ template <int KIND, typename ABS, typename SQRT> class Norm2Accumulator {
   template <typename A>
   RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
     // m * sqrt(1 + sum((others(:)/m)**2))
-    *p = static_cast<Type>(max_ * SQRT::compute(1 + sum_));
+    *p = static_cast<Type>(max_ * SQRTTy<AccumType>::compute(1 + sum_));
   }
   RT_API_ATTRS bool Accumulate(Type x) {
-    auto absX{ABS::compute(static_cast<AccumType>(x))};
+    auto absX{ABSTy<AccumType>::compute(static_cast<AccumType>(x))};
     if (!max_) {
       max_ = absX;
     } else if (absX > max_) {
@@ -424,27 +425,12 @@ template <int KIND, typename ABS, typename SQRT> class Norm2Accumulator {
   AccumType sum_{0}; // sum((others(:)/m)**2)
 };
 
-// Helper class for creating Norm2Accumulator instance
-// based on the given KIND. This helper returns and instance
-// that uses std::abs and std::sqrt for the computations.
-template <int KIND> class Norm2AccumulatorGetter {
-  using AccumType = Norm2AccumType<KIND>;
-
-public:
-  struct ABSTy {
-    static constexpr RT_API_ATTRS AccumType compute(AccumType &&x) {
-      return std::abs(std::forward<AccumType>(x));
-    }
-  };
-  struct SQRTTy {
-    static constexpr RT_API_ATTRS AccumType compute(AccumType &&x) {
-      return std::sqrt(std::forward<AccumType>(x));
-    }
-  };
-
-  using Type = Norm2Accumulator<KIND, ABSTy, SQRTTy>;
-
-  static RT_API_ATTRS Type create(const Descriptor &x) { return Type(x); }
+template <int KIND> struct Norm2Helper {
+  RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x, int dim,
+      const Descriptor *mask, Terminator &terminator) const {
+    DoMaxMinNorm2<TypeCategory::Real, KIND, Norm2Accumulator<KIND>>(
+        result, x, dim, mask, "NORM2", terminator);
+  }
 };
 
 } // namespace Fortran::runtime

From 0869ffa6bd6e9562b35bc93e48207d11261d67df Mon Sep 17 00:00:00 2001
From: SunilKuravinakop <98882378+SunilKuravinakop@users.noreply.github.com>
Date: Thu, 29 Feb 2024 22:41:29 +0530
Subject: [PATCH 45/55] [OpenMP] Clang Codegen Interop : Accept multiple use &
 destroy clauses (#83398)

Modified clang/lib/CodeGen/CGStmtOpenMP.cpp to accept multiple use &
destroy clauses with interop directive.
Modified clang/test/OpenMP/interop_codegen.cpp to check for the changes.

Co-authored-by: Sunil Kuravinakop <kuravina@pe28vega.us.cray.com>
---
 clang/lib/CodeGen/CGStmtOpenMP.cpp    | 34 +++++++++++++++++----------
 clang/test/OpenMP/interop_codegen.cpp | 22 ++++++++++++-----
 2 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index ffcd3ae2711e34..3fbd2e03eb61df 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -7042,18 +7042,28 @@ void CodeGenFunction::EmitOMPInteropDirective(const OMPInteropDirective &S) {
                                       Device, NumDependences, DependenceList,
                                       Data.HasNowaitClause);
     }
-  } else if (const auto *C = S.getSingleClause<OMPDestroyClause>()) {
-    llvm::Value *InteropvarPtr =
-        EmitLValue(C->getInteropVar()).getPointer(*this);
-    OMPBuilder.createOMPInteropDestroy(Builder, InteropvarPtr, Device,
-                                       NumDependences, DependenceList,
-                                       Data.HasNowaitClause);
-  } else if (const auto *C = S.getSingleClause<OMPUseClause>()) {
-    llvm::Value *InteropvarPtr =
-        EmitLValue(C->getInteropVar()).getPointer(*this);
-    OMPBuilder.createOMPInteropUse(Builder, InteropvarPtr, Device,
-                                   NumDependences, DependenceList,
-                                   Data.HasNowaitClause);
+  }
+  auto ItOMPDestroyClause = S.getClausesOfKind<OMPDestroyClause>();
+  if (!ItOMPDestroyClause.empty()) {
+    // Look at the multiple destroy clauses
+    for (const OMPDestroyClause *C : ItOMPDestroyClause) {
+      llvm::Value *InteropvarPtr =
+          EmitLValue(C->getInteropVar()).getPointer(*this);
+      OMPBuilder.createOMPInteropDestroy(Builder, InteropvarPtr, Device,
+                                         NumDependences, DependenceList,
+                                         Data.HasNowaitClause);
+    }
+  }
+  auto ItOMPUseClause = S.getClausesOfKind<OMPUseClause>();
+  if (!ItOMPUseClause.empty()) {
+    // Look at the multiple use clauses
+    for (const OMPUseClause *C : ItOMPUseClause) {
+      llvm::Value *InteropvarPtr =
+          EmitLValue(C->getInteropVar()).getPointer(*this);
+      OMPBuilder.createOMPInteropUse(Builder, InteropvarPtr, Device,
+                                     NumDependences, DependenceList,
+                                     Data.HasNowaitClause);
+    }
   }
 }
 
diff --git a/clang/test/OpenMP/interop_codegen.cpp b/clang/test/OpenMP/interop_codegen.cpp
index ea83ef8ed4909f..31df2f1ba58c5f 100644
--- a/clang/test/OpenMP/interop_codegen.cpp
+++ b/clang/test/OpenMP/interop_codegen.cpp
@@ -15,21 +15,31 @@ typedef long omp_intptr_t;
 extern omp_intptr_t omp_get_interop_int(const omp_interop_t, int, int *);
 
 int main() {
-  omp_interop_t obj = omp_interop_none;
+  omp_interop_t obj1 = omp_interop_none;
+  omp_interop_t obj2 = omp_interop_none;
   omp_interop_t i1 = omp_interop_none;
   omp_interop_t i2 = omp_interop_none;
   omp_interop_t i3 = omp_interop_none;
   omp_interop_t i4 = omp_interop_none;
   omp_interop_t i5 = omp_interop_none;
 
-  #pragma omp interop init(targetsync: i1) init(targetsync: obj)
-  int id = (int )omp_get_interop_int(obj, omp_ipr_fr_id, NULL);
-  int id1 = (int )omp_get_interop_int(i1, omp_ipr_fr_id, NULL);
+  #pragma omp interop init(targetsync: obj1) init(targetsync: obj2)
+  int id = (int )omp_get_interop_int(obj1, omp_ipr_fr_id, NULL);
+  int id1 = (int )omp_get_interop_int(obj2, omp_ipr_fr_id, NULL);
+
+  #pragma omp interop init(target,targetsync: i1) use(i2) use(i3) destroy(i4) destroy(i5)
+  int id2 = (int )omp_get_interop_int(i1, omp_ipr_fr_id, NULL);
+  int id3 = (int )omp_get_interop_int(i2, omp_ipr_fr_id, NULL);
 
 
 }
 #endif
 
-// CHECK-LABEL: define {{.+}}main{{.+}} 
+// CHECK-LABEL: define {{.+}}main{{.+}}
+// CHECK: call {{.+}}__tgt_interop_init({{.+}}obj1{{.*}})
+// CHECK: call {{.+}}__tgt_interop_init({{.+}}obj2{{.*}})
 // CHECK: call {{.+}}__tgt_interop_init({{.+}}i1{{.*}})
-// CHECK: call {{.+}}__tgt_interop_init({{.+}}obj{{.*}})
+// CHECK: call {{.+}}__tgt_interop_destroy({{.+}}i4{{.*}})
+// CHECK: call {{.+}}__tgt_interop_destroy({{.+}}i5{{.*}})
+// CHECK: call {{.+}}__tgt_interop_use({{.+}}i2{{.*}})
+// CHECK: call {{.+}}__tgt_interop_use({{.+}}i3{{.*}})

From a6d9b7ba2722953960b83fbacda1757349f00156 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Thu, 29 Feb 2024 09:11:29 -0800
Subject: [PATCH 46/55] [lldb] Don't cache lldb_find_python_module result

Don't cache lldb_find_python_module result as that requires you to do a
clean build after installing the dependency.
---
 lldb/cmake/modules/AddLLDB.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lldb/cmake/modules/AddLLDB.cmake b/lldb/cmake/modules/AddLLDB.cmake
index 328e883ddbe5a6..fdc4ee0c05d755 100644
--- a/lldb/cmake/modules/AddLLDB.cmake
+++ b/lldb/cmake/modules/AddLLDB.cmake
@@ -383,7 +383,7 @@ endfunction()
 
 function(lldb_find_python_module module)
   set(MODULE_FOUND PY_${module}_FOUND)
-  if (DEFINED ${MODULE_FOUND})
+  if (${MODULE_FOUND})
     return()
   endif()
 
@@ -392,10 +392,10 @@ function(lldb_find_python_module module)
     ERROR_QUIET)
 
   if (status)
-    set(${MODULE_FOUND} OFF CACHE BOOL "Failed to find python module '${module}'")
+    set(${MODULE_FOUND} OFF PARENT_SCOPE)
     message(STATUS "Could NOT find Python module '${module}'")
   else()
-    set(${MODULE_FOUND} ON CACHE BOOL "Found python module '${module}'")
+    set(${MODULE_FOUND} ON PARENT_SCOPE)
     message(STATUS "Found Python module '${module}'")
   endif()
 endfunction()

From 3e0425c76c894c2ed538978140f59bd4a14acabd Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Thu, 29 Feb 2024 09:06:38 -0800
Subject: [PATCH 47/55] [SLP][NFC]Add a test showing incorrect transformation,
 NFC.

---
 .../SLPVectorizer/X86/reorder-node.ll         | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll
new file mode 100644
index 00000000000000..c45e1e8b17bf8c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-linux-gnu < %s | FileCheck %s
+
+define void @test(ptr noalias %arg, ptr noalias %arg1, ptr %arg2) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr noalias [[ARG:%.*]], ptr noalias [[ARG1:%.*]], ptr [[ARG2:%.*]]) {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP_I_I:%.*]] = getelementptr i8, ptr [[ARG1]], i64 24
+; CHECK-NEXT:    [[TMP_I_I4:%.*]] = getelementptr i8, ptr [[ARG]], i64 24
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[TMP_I_I]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; CHECK-NEXT:    store float [[TMP1]], ptr [[ARG2]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <4 x float> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP2]], <4 x float> [[TMP3]], <4 x float> [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT:    store <4 x float> [[TMP6]], ptr [[TMP_I_I4]], align 8
+; CHECK-NEXT:    ret void
+;
+bb:
+  %tmp.i.i = getelementptr i8, ptr %arg1, i64 24
+  %tmp1.i.i = load float, ptr %tmp.i.i, align 8
+  %tmp.i.i2 = getelementptr i8, ptr %arg1, i64 32
+  %tmp1.i.i3 = load float, ptr %tmp.i.i2, align 8
+  %tmp1.i.i.i = fcmp olt float %tmp1.i.i3, 0.000000e+00
+  %tmp9 = select i1 %tmp1.i.i.i, float %tmp1.i.i3, float %tmp1.i.i
+  %tmp.i.i4 = getelementptr i8, ptr %arg, i64 24
+  store float %tmp9, ptr %tmp.i.i4, align 8
+  %tmp1.i.i.i10 = fcmp olt float %tmp1.i.i, 0.000000e+00
+  %tmp13 = select i1 %tmp1.i.i.i10, float %tmp1.i.i3, float %tmp1.i.i
+  %tmp.i.i12 = getelementptr i8, ptr %arg, i64 28
+  store float %tmp13, ptr %tmp.i.i12, align 4
+  %tmp.i.i13 = getelementptr i8, ptr %arg1, i64 28
+  %tmp1.i.i14 = load float, ptr %tmp.i.i13, align 4
+  %tmp.i.i15 = getelementptr i8, ptr %arg1, i64 36
+  %tmp1.i.i16 = load float, ptr %tmp.i.i15, align 4
+  %tmp1.i.i.i18 = fcmp olt float %tmp1.i.i16, 0.000000e+00
+  %tmp17 = select i1 %tmp1.i.i.i18, float %tmp1.i.i16, float %tmp1.i.i14
+  %tmp.i.i20 = getelementptr i8, ptr %arg, i64 32
+  store float %tmp17, ptr %tmp.i.i20, align 8
+  store float %tmp1.i.i14, ptr %arg2, align 4
+  %tmp1.i.i.i24 = fcmp olt float %tmp1.i.i14, 0.000000e+00
+  %tmp20 = select i1 %tmp1.i.i.i24, float %tmp1.i.i16, float %tmp1.i.i14
+  %tmp.i.i26 = getelementptr i8, ptr %arg, i64 36
+  store float %tmp20, ptr %tmp.i.i26, align 4
+  ret void
+}

From be8f987d8cddd2d1df5ed1afff64fd9a730ec97a Mon Sep 17 00:00:00 2001
From: Jason Eckhardt <jeckhardt@nvidia.com>
Date: Thu, 29 Feb 2024 11:57:06 -0600
Subject: [PATCH 48/55] [TableGen] Use consistent field kind checking in
 getNumericKey. (#83284)

Fields GenericField::IsInstruction and GenericField::Enum can be
simultaneously active for a given field. Methods compareBy and
primaryRepresentation both order the checking of IsInstruction before
GenericField::Enum. Do the same in getNumericKey for consistency.
---
 llvm/utils/TableGen/SearchableTableEmitter.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/utils/TableGen/SearchableTableEmitter.cpp b/llvm/utils/TableGen/SearchableTableEmitter.cpp
index 5bab4ff188e8ed..51f18f360ed311 100644
--- a/llvm/utils/TableGen/SearchableTableEmitter.cpp
+++ b/llvm/utils/TableGen/SearchableTableEmitter.cpp
@@ -215,12 +215,15 @@ int64_t SearchableTableEmitter::getNumericKey(const SearchIndex &Index,
                                               Record *Rec) {
   assert(Index.Fields.size() == 1);
 
+  // To be consistent with compareBy and primaryRepresentation elsewhere,
+  // we check for IsInstruction before Enum-- these fields are not exclusive.
+  if (Index.Fields[0].IsInstruction) {
+    Record *TheDef = Rec->getValueAsDef(Index.Fields[0].Name);
+    return Target->getInstrIntValue(TheDef);
+  }
   if (Index.Fields[0].Enum) {
     Record *EnumEntry = Rec->getValueAsDef(Index.Fields[0].Name);
     return Index.Fields[0].Enum->EntryMap[EnumEntry]->second;
-  } else if (Index.Fields[0].IsInstruction) {
-    Record *TheDef = Rec->getValueAsDef(Index.Fields[0].Name);
-    return Target->getInstrIntValue(TheDef);
   }
 
   return getInt(Rec, Index.Fields[0].Name);

From 856ce495e52b8ea8855d7f34d84346c82b1fddc2 Mon Sep 17 00:00:00 2001
From: Kalesh Singh <kaleshsingh96@gmail.com>
Date: Thu, 29 Feb 2024 10:06:25 -0800
Subject: [PATCH 49/55] ANDROID: AArch64: Change default max-page-size from 4k
 to 16k (#70251)

File size increase were found negligible from sparseness and zero
block compression in f2fs and ext4 (supported filesystems for
Android's userdata partition).

Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Co-authored-by: Kalesh Singh <kaleshsingh@google.com>
---
 clang/lib/Driver/ToolChains/Linux.cpp | 17 ++++++++++++-----
 clang/test/Driver/android-link.cpp    |  3 ++-
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 4300a2bdff1791..dc09b13351f46f 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -237,11 +237,18 @@ Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
     ExtraOpts.push_back("relro");
   }
 
-  // Android ARM/AArch64 use max-page-size=4096 to reduce VMA usage. Note, lld
-  // from 11 onwards default max-page-size to 65536 for both ARM and AArch64.
-  if ((Triple.isARM() || Triple.isAArch64()) && Triple.isAndroid()) {
-    ExtraOpts.push_back("-z");
-    ExtraOpts.push_back("max-page-size=4096");
+  // Note, lld from 11 onwards default max-page-size to 65536 for both ARM and
+  // AArch64.
+  if (Triple.isAndroid()) {
+    if (Triple.isARM()) {
+      // Android ARM uses max-page-size=4096 to reduce VMA usage.
+      ExtraOpts.push_back("-z");
+      ExtraOpts.push_back("max-page-size=4096");
+    } else if (Triple.isAArch64()) {
+      // Android AArch64 uses max-page-size=16384 to support 4k/16k page sizes.
+      ExtraOpts.push_back("-z");
+      ExtraOpts.push_back("max-page-size=16384");
+    }
   }
 
   if (GCCInstallation.getParentLibPath().contains("opt/rh/"))
diff --git a/clang/test/Driver/android-link.cpp b/clang/test/Driver/android-link.cpp
index fa9cbc5d0c7a55..f9bdd00507d7bc 100644
--- a/clang/test/Driver/android-link.cpp
+++ b/clang/test/Driver/android-link.cpp
@@ -17,9 +17,10 @@
 //
 // RUN: %clang -target aarch64-none-linux-android \
 // RUN:   -### -v %s 2> %t
-// RUN: FileCheck -check-prefix=MAX-PAGE-SIZE < %t %s
+// RUN: FileCheck -check-prefix=MAX-PAGE-SIZE-AARCH64 < %t %s
 //
 // GENERIC-ARM: --fix-cortex-a53-843419
 // CORTEX-A53: --fix-cortex-a53-843419
 // CORTEX-A57-NOT: --fix-cortex-a53-843419
 // MAX-PAGE-SIZE: "-z" "max-page-size=4096"
+// MAX-PAGE-SIZE-AARCH64: "-z" "max-page-size=16384"

From 489eadd142e858dc28e375320da774eba53d21bb Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <1802579+farzonl@users.noreply.github.com>
Date: Thu, 29 Feb 2024 13:40:38 -0500
Subject: [PATCH 50/55] [HLSL] Implementation of the frac intrinsic (#83315)

This change implements the frontend  for #70099
Builtins.td           - add the frac builtin
CGBuiltin.cpp         - add the builtin to DirectX intrinsic mapping
hlsl_intrinsics.h     - add the frac api
SemaChecking.cpp      - add type checks for builtin
IntrinsicsDirectX.td  - add the frac intrinsic

The backend changes for this are going to be very simple:
https://github.com/llvm/llvm-project/commit/f309a0eb558b65dfaff0d1d23b7d07fb07e27121

They were not included because llvm/lib/Target/DirectX/DXIL.td is going
through a major refactor.
---
 clang/include/clang/Basic/Builtins.td         |   6 +
 .../clang/Basic/DiagnosticSemaKinds.td        |   2 +-
 clang/lib/CodeGen/CGBuiltin.cpp               |  23 ++-
 clang/lib/Headers/hlsl/hlsl_intrinsics.h      |  32 ++++
 clang/lib/Sema/SemaChecking.cpp               |  26 +++
 clang/test/CodeGenHLSL/builtins/dot.hlsl      | 160 +++++-------------
 clang/test/CodeGenHLSL/builtins/frac.hlsl     |  53 ++++++
 clang/test/CodeGenHLSL/builtins/lerp.hlsl     |  52 ++----
 clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl  |  85 +++++-----
 clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl |  27 +++
 clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl |  79 +++++----
 .../test/SemaHLSL/OverloadResolutionBugs.hlsl |  64 ++++---
 llvm/include/llvm/IR/IntrinsicsDirectX.td     |   8 +-
 13 files changed, 337 insertions(+), 280 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/builtins/frac.hlsl
 create mode 100644 clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl

diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 36151b49d9363d..2c83dca248fb7d 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4536,6 +4536,12 @@ def HLSLDotProduct : LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void(...)";
 }
 
+def HLSLFrac : LangBuiltin<"HLSL_LANG"> {
+  let Spellings = ["__builtin_hlsl_elementwise_frac"];
+  let Attributes = [NoThrow, Const];
+  let Prototype = "void(...)";
+}
+
 def HLSLLerp : LangBuiltin<"HLSL_LANG"> {
   let Spellings = ["__builtin_hlsl_lerp"];
   let Attributes = [NoThrow, Const];
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 4ef3ac8c96cd26..4f8902e37bd3bb 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10270,7 +10270,7 @@ def err_vec_builtin_non_vector_all : Error<
  "all arguments to %0 must be vectors">;
 def err_vec_builtin_incompatible_vector_all : Error<
   "all arguments to %0 must have vectors of the same type">;
-  
+
 def err_vec_builtin_non_vector : Error<
  "first two arguments to %0 must be vectors">;
 def err_vec_builtin_incompatible_vector : Error<
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 74ca96117793c4..98684448f4ff5c 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18020,14 +18020,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
         V = Builder.CreateFMul(S, V);
         return Builder.CreateFAdd(X, V, "dx.lerp");
       }
-      // DXC does this via casting to float should we do the same thing?
-      if (Xty->isIntegerTy()) {
-        auto V = Builder.CreateSub(Y, X);
-        V = Builder.CreateMul(S, V);
-        return Builder.CreateAdd(X, V, "dx.lerp");
-      }
-      // Bools should have been promoted
-      llvm_unreachable("Scalar Lerp is only supported on ints and floats.");
+      llvm_unreachable("Scalar Lerp is only supported on floats.");
     }
     // A VectorSplat should have happened
     assert(Xty->isVectorTy() && Yty->isVectorTy() && Sty->isVectorTy() &&
@@ -18041,12 +18034,24 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
         E->getArg(2)->getType()->getAs<VectorType>();
     // A HLSLVectorTruncation should have happend
     assert(XVecTy->getNumElements() == YVecTy->getNumElements() &&
-           SVecTy->getNumElements() &&
+           XVecTy->getNumElements() == SVecTy->getNumElements() &&
            "Lerp requires vectors to be of the same size.");
+    assert(XVecTy->getElementType()->isRealFloatingType() &&
+           XVecTy->getElementType() == YVecTy->getElementType() &&
+           XVecTy->getElementType() == SVecTy->getElementType() &&
+           "Lerp requires float vectors to be of the same type.");
     return Builder.CreateIntrinsic(
         /*ReturnType*/ Xty, Intrinsic::dx_lerp, ArrayRef<Value *>{X, Y, S},
         nullptr, "dx.lerp");
   }
+  case Builtin::BI__builtin_hlsl_elementwise_frac: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    if (!E->getArg(0)->getType()->hasFloatingRepresentation())
+      llvm_unreachable("frac operand must have a float representation");
+    return Builder.CreateIntrinsic(
+        /*ReturnType*/ Op0->getType(), Intrinsic::dx_frac,
+        ArrayRef<Value *>{Op0}, nullptr, "dx.frac");
+  }
   }
   return nullptr;
 }
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 1314bdefa37e7b..0aa8651ba80dc4 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -317,6 +317,38 @@ double3 floor(double3);
 _HLSL_BUILTIN_ALIAS(__builtin_elementwise_floor)
 double4 floor(double4);
 
+//===----------------------------------------------------------------------===//
+// frac builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn T frac(T x)
+/// \brief Returns the fractional (or decimal) part of x. \a x parameter.
+/// \param x The specified input value.
+///
+/// If \a the return value is greater than or equal to 0 and less than 1.
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_frac)
+half frac(half);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_frac)
+half2 frac(half2);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_frac)
+half3 frac(half3);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_frac)
+half4 frac(half4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_frac)
+float frac(float);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_frac)
+float2 frac(float2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_frac)
+float3 frac(float3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_frac)
+float4 frac(float4);
+
 //===----------------------------------------------------------------------===//
 // lerp builtins
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 016e9830662042..690bdaa63d058b 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5246,6 +5246,23 @@ bool CheckVectorElementCallArgs(Sema *S, CallExpr *TheCall) {
   return true;
 }
 
+bool CheckAllArgsHaveFloatRepresentation(Sema *S, CallExpr *TheCall) {
+  QualType ExpectedType = S->Context.FloatTy;
+  for (unsigned i = 0; i < TheCall->getNumArgs(); ++i) {
+    QualType PassedType = TheCall->getArg(i)->getType();
+    if (!PassedType->hasFloatingRepresentation()) {
+      if (auto *VecTyA = PassedType->getAs<VectorType>())
+        ExpectedType = S->Context.getVectorType(
+            ExpectedType, VecTyA->getNumElements(), VecTyA->getVectorKind());
+      S->Diag(TheCall->getArg(0)->getBeginLoc(),
+              diag::err_typecheck_convert_incompatible)
+          << PassedType << ExpectedType << 1 << 0 << 0;
+      return true;
+    }
+  }
+  return false;
+}
+
 // Note: returning true in this case results in CheckBuiltinFunctionCall
 // returning an ExprError
 bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
@@ -5259,6 +5276,13 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
       return true;
     break;
   }
+  case Builtin::BI__builtin_hlsl_elementwise_frac: {
+    if (PrepareBuiltinElementwiseMathOneArgCall(TheCall))
+      return true;
+    if (CheckAllArgsHaveFloatRepresentation(this, TheCall))
+      return true;
+    break;
+  }
   case Builtin::BI__builtin_hlsl_lerp: {
     if (checkArgCount(*this, TheCall, 3))
       return true;
@@ -5266,6 +5290,8 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
       return true;
     if (SemaBuiltinElementwiseTernaryMath(TheCall))
       return true;
+    if (CheckAllArgsHaveFloatRepresentation(this, TheCall))
+      return true;
     break;
   }
   }
diff --git a/clang/test/CodeGenHLSL/builtins/dot.hlsl b/clang/test/CodeGenHLSL/builtins/dot.hlsl
index b2c1bae31d13b1..c064d118caf3e7 100644
--- a/clang/test/CodeGenHLSL/builtins/dot.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/dot.hlsl
@@ -9,230 +9,160 @@
 #ifdef __HLSL_ENABLE_16_BIT
 // NATIVE_HALF: %dx.dot = mul i16 %0, %1
 // NATIVE_HALF: ret i16 %dx.dot
-int16_t test_dot_short ( int16_t p0, int16_t p1 ) {
-  return dot ( p0, p1 );
-}
+int16_t test_dot_short(int16_t p0, int16_t p1) { return dot(p0, p1); }
 
 // NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v2i16(<2 x i16> %0, <2 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
-int16_t test_dot_short2 ( int16_t2 p0, int16_t2 p1 ) {
-  return dot ( p0, p1 );
-}
+int16_t test_dot_short2(int16_t2 p0, int16_t2 p1) { return dot(p0, p1); }
 
 // NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v3i16(<3 x i16> %0, <3 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
-int16_t test_dot_short3 ( int16_t3 p0, int16_t3 p1 ) {
-  return dot ( p0, p1 );
-}
+int16_t test_dot_short3(int16_t3 p0, int16_t3 p1) { return dot(p0, p1); }
 
 // NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v4i16(<4 x i16> %0, <4 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
-int16_t test_dot_short4 ( int16_t4 p0, int16_t4 p1 ) {
-  return dot ( p0, p1 );
-}
+int16_t test_dot_short4(int16_t4 p0, int16_t4 p1) { return dot(p0, p1); }
 
 // NATIVE_HALF: %dx.dot = mul i16 %0, %1
 // NATIVE_HALF: ret i16 %dx.dot
-uint16_t test_dot_ushort ( uint16_t p0, uint16_t p1 ) {
-  return dot ( p0, p1 );
-}
+uint16_t test_dot_ushort(uint16_t p0, uint16_t p1) { return dot(p0, p1); }
 
 // NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v2i16(<2 x i16> %0, <2 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
-uint16_t test_dot_ushort2 ( uint16_t2 p0, uint16_t2 p1 ) {
-  return dot ( p0, p1 );
-}
+uint16_t test_dot_ushort2(uint16_t2 p0, uint16_t2 p1) { return dot(p0, p1); }
 
 // NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v3i16(<3 x i16> %0, <3 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
-uint16_t test_dot_ushort3 ( uint16_t3 p0, uint16_t3 p1 ) {
-  return dot ( p0, p1 );
-}
+uint16_t test_dot_ushort3(uint16_t3 p0, uint16_t3 p1) { return dot(p0, p1); }
 
 // NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v4i16(<4 x i16> %0, <4 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
-uint16_t test_dot_ushort4 ( uint16_t4 p0, uint16_t4 p1 ) {
-  return dot ( p0, p1 );
-}
+uint16_t test_dot_ushort4(uint16_t4 p0, uint16_t4 p1) { return dot(p0, p1); }
 #endif
 
 // CHECK: %dx.dot = mul i32 %0, %1
 // CHECK: ret i32 %dx.dot
-int test_dot_int ( int p0, int p1 ) {
-  return dot ( p0, p1 );
-}
+int test_dot_int(int p0, int p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = call i32 @llvm.dx.dot.v2i32(<2 x i32> %0, <2 x i32> %1)
 // CHECK: ret i32 %dx.dot
-int test_dot_int2 ( int2 p0, int2 p1 ) {
-  return dot ( p0, p1 );
-}
+int test_dot_int2(int2 p0, int2 p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = call i32 @llvm.dx.dot.v3i32(<3 x i32> %0, <3 x i32> %1)
 // CHECK: ret i32 %dx.dot
-int test_dot_int3 ( int3 p0, int3 p1 ) {
-  return dot ( p0, p1 );
-}
+int test_dot_int3(int3 p0, int3 p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = call i32 @llvm.dx.dot.v4i32(<4 x i32> %0, <4 x i32> %1)
 // CHECK: ret i32 %dx.dot
-int test_dot_int4 ( int4 p0, int4 p1 ) {
-  return dot ( p0, p1 );
-}
+int test_dot_int4(int4 p0, int4 p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = mul i32 %0, %1
 // CHECK: ret i32 %dx.dot
-uint test_dot_uint ( uint p0, uint p1 ) {
-  return dot ( p0, p1 );
-}
+uint test_dot_uint(uint p0, uint p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = call i32 @llvm.dx.dot.v2i32(<2 x i32> %0, <2 x i32> %1)
 // CHECK: ret i32 %dx.dot
-uint test_dot_uint2 ( uint2 p0, uint2 p1 ) {
-  return dot ( p0, p1 );
-}
+uint test_dot_uint2(uint2 p0, uint2 p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = call i32 @llvm.dx.dot.v3i32(<3 x i32> %0, <3 x i32> %1)
 // CHECK: ret i32 %dx.dot
-uint test_dot_uint3 ( uint3 p0, uint3 p1 ) {
-  return dot ( p0, p1 );
-}
+uint test_dot_uint3(uint3 p0, uint3 p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = call i32 @llvm.dx.dot.v4i32(<4 x i32> %0, <4 x i32> %1)
 // CHECK: ret i32 %dx.dot
-uint test_dot_uint4 ( uint4 p0, uint4 p1 ) {
-  return dot ( p0, p1 );
-}
+uint test_dot_uint4(uint4 p0, uint4 p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = mul i64 %0, %1
 // CHECK: ret i64 %dx.dot
-int64_t test_dot_long ( int64_t p0, int64_t p1 ) {
-  return dot ( p0, p1 );
-}
+int64_t test_dot_long(int64_t p0, int64_t p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = call i64 @llvm.dx.dot.v2i64(<2 x i64> %0, <2 x i64> %1)
 // CHECK: ret i64 %dx.dot
-int64_t test_dot_long2 ( int64_t2 p0, int64_t2 p1 ) {
-  return dot ( p0, p1 );
-}
+int64_t test_dot_long2(int64_t2 p0, int64_t2 p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = call i64 @llvm.dx.dot.v3i64(<3 x i64> %0, <3 x i64> %1)
 // CHECK: ret i64 %dx.dot
-int64_t test_dot_long3 ( int64_t3 p0, int64_t3 p1 ) {
-  return dot ( p0, p1 );
-}
+int64_t test_dot_long3(int64_t3 p0, int64_t3 p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = call i64 @llvm.dx.dot.v4i64(<4 x i64> %0, <4 x i64> %1)
 // CHECK: ret i64 %dx.dot
-int64_t test_dot_long4 ( int64_t4 p0, int64_t4 p1 ) {
-  return dot ( p0, p1 );
-}
+int64_t test_dot_long4(int64_t4 p0, int64_t4 p1) { return dot(p0, p1); }
 
 // CHECK:  %dx.dot = mul i64 %0, %1
 // CHECK: ret i64 %dx.dot
-uint64_t test_dot_ulong ( uint64_t p0, uint64_t p1 ) {
-  return dot ( p0, p1 );
-}
+uint64_t test_dot_ulong(uint64_t p0, uint64_t p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = call i64 @llvm.dx.dot.v2i64(<2 x i64> %0, <2 x i64> %1)
 // CHECK: ret i64 %dx.dot
-uint64_t test_dot_ulong2 ( uint64_t2 p0, uint64_t2 p1 ) {
-  return dot ( p0, p1 );
-}
+uint64_t test_dot_ulong2(uint64_t2 p0, uint64_t2 p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = call i64 @llvm.dx.dot.v3i64(<3 x i64> %0, <3 x i64> %1)
 // CHECK: ret i64 %dx.dot
-uint64_t test_dot_ulong3 ( uint64_t3 p0, uint64_t3 p1 ) {
-  return dot ( p0, p1 );
-}
+uint64_t test_dot_ulong3(uint64_t3 p0, uint64_t3 p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = call i64 @llvm.dx.dot.v4i64(<4 x i64> %0, <4 x i64> %1)
 // CHECK: ret i64 %dx.dot
-uint64_t test_dot_ulong4 ( uint64_t4 p0, uint64_t4 p1 ) {
-  return dot ( p0, p1 );
-}
+uint64_t test_dot_ulong4(uint64_t4 p0, uint64_t4 p1) { return dot(p0, p1); }
 
 // NATIVE_HALF: %dx.dot = fmul half %0, %1
 // NATIVE_HALF: ret half %dx.dot
 // NO_HALF: %dx.dot = fmul float %0, %1
 // NO_HALF: ret float %dx.dot
-half test_dot_half ( half p0, half p1 ) {
-  return dot ( p0, p1 );
-}
+half test_dot_half(half p0, half p1) { return dot(p0, p1); }
 
 // NATIVE_HALF: %dx.dot = call half @llvm.dx.dot.v2f16(<2 x half> %0, <2 x half> %1)
 // NATIVE_HALF: ret half %dx.dot
 // NO_HALF: %dx.dot = call float @llvm.dx.dot.v2f32(<2 x float> %0, <2 x float> %1)
 // NO_HALF: ret float %dx.dot
-half test_dot_half2 ( half2 p0, half2 p1 ) {
-  return dot ( p0, p1 );
-}
+half test_dot_half2(half2 p0, half2 p1) { return dot(p0, p1); }
 
 // NATIVE_HALF: %dx.dot = call half @llvm.dx.dot.v3f16(<3 x half> %0, <3 x half> %1)
 // NATIVE_HALF: ret half %dx.dot
 // NO_HALF: %dx.dot = call float @llvm.dx.dot.v3f32(<3 x float> %0, <3 x float> %1)
 // NO_HALF: ret float %dx.dot
-half test_dot_half3 ( half3 p0, half3 p1 ) {
-  return dot ( p0, p1 );
-}
+half test_dot_half3(half3 p0, half3 p1) { return dot(p0, p1); }
 
 // NATIVE_HALF: %dx.dot = call half @llvm.dx.dot.v4f16(<4 x half> %0, <4 x half> %1)
 // NATIVE_HALF: ret half %dx.dot
 // NO_HALF: %dx.dot = call float @llvm.dx.dot.v4f32(<4 x float> %0, <4 x float> %1)
 // NO_HALF: ret float %dx.dot
-half test_dot_half4 ( half4 p0, half4 p1 ) {
-  return dot ( p0, p1 );
-}
+half test_dot_half4(half4 p0, half4 p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = fmul float %0, %1
 // CHECK: ret float %dx.dot
-float test_dot_float ( float p0, float p1 ) {
-  return dot ( p0, p1 );
-}
+float test_dot_float(float p0, float p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = call float @llvm.dx.dot.v2f32(<2 x float> %0, <2 x float> %1)
 // CHECK: ret float %dx.dot
-float test_dot_float2 ( float2 p0, float2 p1 ) {
-  return dot ( p0, p1 );
-}
+float test_dot_float2(float2 p0, float2 p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = call float @llvm.dx.dot.v3f32(<3 x float> %0, <3 x float> %1)
 // CHECK: ret float %dx.dot
-float test_dot_float3 ( float3 p0, float3 p1 ) {
-  return dot ( p0, p1 );
-}
+float test_dot_float3(float3 p0, float3 p1) { return dot(p0, p1); }
 
 // CHECK: %dx.dot = call float @llvm.dx.dot.v4f32(<4 x float> %0, <4 x float> %1)
 // CHECK: ret float %dx.dot
-float test_dot_float4 ( float4 p0, float4 p1) {
-  return dot ( p0, p1 );
-}
+float test_dot_float4(float4 p0, float4 p1) { return dot(p0, p1); }
 
 // CHECK:  %dx.dot = call float @llvm.dx.dot.v2f32(<2 x float> %splat.splat, <2 x float> %1)
 // CHECK: ret float %dx.dot
-float test_dot_float2_splat ( float p0, float2 p1 ) {
-  return dot( p0, p1 );
-}
+float test_dot_float2_splat(float p0, float2 p1) { return dot(p0, p1); }
 
 // CHECK:  %dx.dot = call float @llvm.dx.dot.v3f32(<3 x float> %splat.splat, <3 x float> %1)
 // CHECK: ret float %dx.dot
-float test_dot_float3_splat ( float p0, float3 p1 ) {
-  return dot( p0, p1 );
-}
+float test_dot_float3_splat(float p0, float3 p1) { return dot(p0, p1); }
 
 // CHECK:  %dx.dot = call float @llvm.dx.dot.v4f32(<4 x float> %splat.splat, <4 x float> %1)
 // CHECK: ret float %dx.dot
-float test_dot_float4_splat ( float p0, float4 p1 ) {
-  return dot( p0, p1 );
-}
+float test_dot_float4_splat(float p0, float4 p1) { return dot(p0, p1); }
 
 // CHECK: %conv = sitofp i32 %1 to float
 // CHECK: %splat.splatinsert = insertelement <2 x float> poison, float %conv, i64 0
 // CHECK: %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> poison, <2 x i32> zeroinitializer
 // CHECK: %dx.dot = call float @llvm.dx.dot.v2f32(<2 x float> %0, <2 x float> %splat.splat)
 // CHECK: ret float %dx.dot
-float test_builtin_dot_float2_int_splat ( float2 p0, int p1 ) {
-  return dot ( p0, p1 );
+float test_builtin_dot_float2_int_splat(float2 p0, int p1) {
+  return dot(p0, p1);
 }
 
 // CHECK: %conv = sitofp i32 %1 to float
@@ -240,26 +170,24 @@ float test_builtin_dot_float2_int_splat ( float2 p0, int p1 ) {
 // CHECK: %splat.splat = shufflevector <3 x float> %splat.splatinsert, <3 x float> poison, <3 x i32> zeroinitializer
 // CHECK: %dx.dot = call float @llvm.dx.dot.v3f32(<3 x float> %0, <3 x float> %splat.splat)
 // CHECK: ret float %dx.dot
-float test_builtin_dot_float3_int_splat ( float3 p0, int p1 ) {
-  return dot ( p0, p1 );
+float test_builtin_dot_float3_int_splat(float3 p0, int p1) {
+  return dot(p0, p1);
 }
 
 // CHECK: %dx.dot = fmul double %0, %1
 // CHECK: ret double %dx.dot
-double test_dot_double ( double p0, double p1 ) {
-  return dot ( p0, p1 );
-}
+double test_dot_double(double p0, double p1) { return dot(p0, p1); }
 
 // CHECK: %conv = zext i1 %tobool to i32
 // CHECK: %dx.dot = mul i32 %conv, %1
 // CHECK: ret i32 %dx.dot
-int test_dot_bool_scalar_arg0_type_promotion ( bool p0, int p1 ) {
-  return dot ( p0, p1 );
+int test_dot_bool_scalar_arg0_type_promotion(bool p0, int p1) {
+  return dot(p0, p1);
 }
 
 // CHECK: %conv = zext i1 %tobool to i32
 // CHECK: %dx.dot = mul i32 %0, %conv
 // CHECK: ret i32 %dx.dot
-int test_dot_bool_scalar_arg1_type_promotion ( int p0, bool p1 ) {
-  return dot ( p0, p1 );
+int test_dot_bool_scalar_arg1_type_promotion(int p0, bool p1) {
+  return dot(p0, p1);
 }
diff --git a/clang/test/CodeGenHLSL/builtins/frac.hlsl b/clang/test/CodeGenHLSL/builtins/frac.hlsl
new file mode 100644
index 00000000000000..7c4d1468e96d27
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/frac.hlsl
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: %dx.frac = call half @llvm.dx.frac.f16(
+// NATIVE_HALF: ret half %dx.frac
+// NO_HALF: define noundef float @"?test_frac_half@@YA$halff@$halff@@Z"(
+// NO_HALF: %dx.frac = call float @llvm.dx.frac.f32(
+// NO_HALF: ret float %dx.frac
+half test_frac_half(half p0) { return frac(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: %dx.frac = call <2 x half> @llvm.dx.frac.v2f16
+// NATIVE_HALF: ret <2 x half> %dx.frac
+// NO_HALF: define noundef <2 x float> @
+// NO_HALF: %dx.frac = call <2 x float> @llvm.dx.frac.v2f32(
+// NO_HALF: ret <2 x float> %dx.frac
+half2 test_frac_half2(half2 p0) { return frac(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: %dx.frac = call <3 x half> @llvm.dx.frac.v3f16
+// NATIVE_HALF: ret <3 x half> %dx.frac
+// NO_HALF: define noundef <3 x float> @
+// NO_HALF: %dx.frac = call <3 x float> @llvm.dx.frac.v3f32(
+// NO_HALF: ret <3 x float> %dx.frac
+half3 test_frac_half3(half3 p0) { return frac(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: %dx.frac = call <4 x half> @llvm.dx.frac.v4f16
+// NATIVE_HALF: ret <4 x half> %dx.frac
+// NO_HALF: define noundef <4 x float> @
+// NO_HALF: %dx.frac = call <4 x float> @llvm.dx.frac.v4f32(
+// NO_HALF: ret <4 x float> %dx.frac
+half4 test_frac_half4(half4 p0) { return frac(p0); }
+
+// CHECK: define noundef float @
+// CHECK: %dx.frac = call float @llvm.dx.frac.f32(
+// CHECK: ret float %dx.frac
+float test_frac_float(float p0) { return frac(p0); }
+// CHECK: define noundef <2 x float> @
+// CHECK: %dx.frac = call <2 x float> @llvm.dx.frac.v2f32
+// CHECK: ret <2 x float> %dx.frac
+float2 test_frac_float2(float2 p0) { return frac(p0); }
+// CHECK: define noundef <3 x float> @
+// CHECK: %dx.frac = call <3 x float> @llvm.dx.frac.v3f32
+// CHECK: ret <3 x float> %dx.frac
+float3 test_frac_float3(float3 p0) { return frac(p0); }
+// CHECK: define noundef <4 x float> @
+// CHECK: %dx.frac = call <4 x float> @llvm.dx.frac.v4f32
+// CHECK: ret <4 x float> %dx.frac
+float4 test_frac_float4(float4 p0) { return frac(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/lerp.hlsl b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
index 1297f6b85bbd48..a6b3d9643d674c 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
@@ -14,85 +14,63 @@
 // NO_HALF: %4 = fmul float %2, %3
 // NO_HALF: %dx.lerp = fadd float %0, %4
 // NO_HALF: ret float %dx.lerp
-half test_lerp_half ( half p0) {
-  return lerp ( p0, p0, p0 );
-}
+half test_lerp_half(half p0) { return lerp(p0, p0, p0); }
 
 // NATIVE_HALF: %dx.lerp = call <2 x half> @llvm.dx.lerp.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2)
 // NATIVE_HALF: ret <2 x half> %dx.lerp
 // NO_HALF: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
 // NO_HALF: ret <2 x float> %dx.lerp
-half2  test_lerp_half2 ( half2 p0, half2 p1 ) {
-  return lerp ( p0, p0, p0 );
-}
+half2 test_lerp_half2(half2 p0, half2 p1) { return lerp(p0, p0, p0); }
 
 // NATIVE_HALF: %dx.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
 // NATIVE_HALF: ret <3 x half> %dx.lerp
 // NO_HALF: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
 // NO_HALF: ret <3 x float> %dx.lerp
-half3  test_lerp_half3 ( half3 p0, half3 p1 ) {
-  return lerp ( p0, p0, p0 );
-}
+half3 test_lerp_half3(half3 p0, half3 p1) { return lerp(p0, p0, p0); }
 
 // NATIVE_HALF: %dx.lerp = call <4 x half> @llvm.dx.lerp.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
 // NATIVE_HALF: ret <4 x half> %dx.lerp
 // NO_HALF: %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
 // NO_HALF: ret <4 x float> %dx.lerp
-half4  test_lerp_half4 ( half4 p0, half4 p1 ) {
-  return lerp ( p0, p0, p0 );
-}
+half4 test_lerp_half4(half4 p0, half4 p1) { return lerp(p0, p0, p0); }
 
 // CHECK: %3 = fsub float %1, %0
 // CHECK: %4 = fmul float %2, %3
 // CHECK: %dx.lerp = fadd float %0, %4
 // CHECK: ret float %dx.lerp
-float  test_lerp_float ( float p0, float p1 ) {
-  return lerp ( p0, p0, p0 );
-}
+float test_lerp_float(float p0, float p1) { return lerp(p0, p0, p0); }
 
 // CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
 // CHECK: ret <2 x float> %dx.lerp
-float2  test_lerp_float2 ( float2 p0, float2 p1 ) {
-  return lerp ( p0, p0, p0 );
-}
+float2 test_lerp_float2(float2 p0, float2 p1) { return lerp(p0, p0, p0); }
 
 // CHECK: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
 // CHECK: ret <3 x float> %dx.lerp
-float3  test_lerp_float3 ( float3 p0, float3 p1 ) {
-  return lerp ( p0, p0, p0 );
-}
+float3 test_lerp_float3(float3 p0, float3 p1) { return lerp(p0, p0, p0); }
 
 // CHECK: %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
 // CHECK: ret <4 x float> %dx.lerp
-float4  test_lerp_float4 ( float4 p0, float4 p1) {
-  return lerp ( p0, p0, p0 );
-}
+float4 test_lerp_float4(float4 p0, float4 p1) { return lerp(p0, p0, p0); }
 
 // CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
 // CHECK: ret <2 x float> %dx.lerp
-float2  test_lerp_float2_splat ( float p0, float2 p1 ) {
-  return lerp( p0, p1, p1 );
-}
+float2 test_lerp_float2_splat(float p0, float2 p1) { return lerp(p0, p1, p1); }
 
 // CHECK: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2)
 // CHECK: ret <3 x float> %dx.lerp
-float3  test_lerp_float3_splat ( float p0, float3 p1 ) {
-  return lerp( p0, p1, p1 );
-}
+float3 test_lerp_float3_splat(float p0, float3 p1) { return lerp(p0, p1, p1); }
 
 // CHECK:  %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2)
 // CHECK:  ret <4 x float> %dx.lerp
-float4  test_lerp_float4_splat ( float p0, float4 p1 ) {
-  return lerp( p0, p1, p1 );
-}
+float4 test_lerp_float4_splat(float p0, float4 p1) { return lerp(p0, p1, p1); }
 
 // CHECK: %conv = sitofp i32 %2 to float
 // CHECK: %splat.splatinsert = insertelement <2 x float> poison, float %conv, i64 0
 // CHECK: %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> poison, <2 x i32> zeroinitializer
 // CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat)
 // CHECK: ret <2 x float> %dx.lerp
-float2 test_lerp_float2_int_splat ( float2 p0, int p1 ) {
-  return lerp ( p0, p0, p1 );
+float2 test_lerp_float2_int_splat(float2 p0, int p1) {
+  return lerp(p0, p0, p1);
 }
 
 // CHECK: %conv = sitofp i32 %2 to float
@@ -100,6 +78,6 @@ float2 test_lerp_float2_int_splat ( float2 p0, int p1 ) {
 // CHECK: %splat.splat = shufflevector <3 x float> %splat.splatinsert, <3 x float> poison, <3 x i32> zeroinitializer
 // CHECK:  %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat)
 // CHECK: ret <3 x float> %dx.lerp
-float3 test_lerp_float3_int_splat ( float3 p0, int p1 ) {
-  return lerp ( p0, p0, p1 );
+float3 test_lerp_float3_int_splat(float3 p0, int p1) {
+  return lerp(p0, p0, p1);
 }
diff --git a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
index 5dbb52a80c6bd0..8de8f86d7eb260 100644
--- a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
@@ -1,109 +1,110 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected
 
-float test_no_second_arg ( float2 p0) {
-  return __builtin_hlsl_dot ( p0 );
+float test_no_second_arg(float2 p0) {
+  return __builtin_hlsl_dot(p0);
   // expected-error@-1 {{too few arguments to function call, expected 2, have 1}}
 }
 
-float test_too_many_arg ( float2 p0) {
-  return __builtin_hlsl_dot ( p0, p0, p0 );
+float test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_dot(p0, p0, p0);
   // expected-error@-1 {{too many arguments to function call, expected 2, have 3}}
 }
 
-float test_dot_no_second_arg ( float2 p0) {
-  return dot ( p0 );
+float test_dot_no_second_arg(float2 p0) {
+  return dot(p0);
   // expected-error@-1 {{no matching function for call to 'dot'}}
 }
 
-float test_dot_vector_size_mismatch ( float3 p0, float2 p1 ) {
-  return dot ( p0, p1 );
+float test_dot_vector_size_mismatch(float3 p0, float2 p1) {
+  return dot(p0, p1);
   // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'float __attribute__((ext_vector_type(2)))' (vector of 2 'float' values)}}
 }
 
-float test_dot_builtin_vector_size_mismatch ( float3 p0, float2 p1 ) {
-  return __builtin_hlsl_dot ( p0, p1 );
+float test_dot_builtin_vector_size_mismatch(float3 p0, float2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 
-float test_dot_scalar_mismatch ( float p0, int p1 ) {
-  return dot ( p0, p1 );
+float test_dot_scalar_mismatch(float p0, int p1) {
+  return dot(p0, p1);
   // expected-error@-1 {{call to 'dot' is ambiguous}}
 }
 
-float test_dot_element_type_mismatch ( int2 p0, float2 p1 ) {
-  return dot ( p0, p1 );
+float test_dot_element_type_mismatch(int2 p0, float2 p1) {
+  return dot(p0, p1);
   // expected-error@-1 {{call to 'dot' is ambiguous}}
 }
 
 //NOTE: for all the *_promotion we are intentionally not handling type promotion in builtins
-float test_builtin_dot_vec_int_to_float_promotion ( int2 p0, float2 p1 ) {
-  return __builtin_hlsl_dot ( p0, p1 );
+float test_builtin_dot_vec_int_to_float_promotion(int2 p0, float2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 
-int64_t test_builtin_dot_vec_int_to_int64_promotion( int64_t2 p0, int2 p1 ) {
-  return __builtin_hlsl_dot( p0, p1 );
+int64_t test_builtin_dot_vec_int_to_int64_promotion(int64_t2 p0, int2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 
-float test_builtin_dot_vec_half_to_float_promotion( float2 p0, half2 p1 ) {
-  return __builtin_hlsl_dot( p0, p1 );
+float test_builtin_dot_vec_half_to_float_promotion(float2 p0, half2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 
 #ifdef __HLSL_ENABLE_16_BIT
-float test_builtin_dot_vec_int16_to_float_promotion( float2 p0, int16_t2 p1 ) {
-  return __builtin_hlsl_dot( p0, p1 );
+float test_builtin_dot_vec_int16_to_float_promotion(float2 p0, int16_t2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 
-half test_builtin_dot_vec_int16_to_half_promotion( half2 p0, int16_t2 p1 ) {
-  return __builtin_hlsl_dot( p0, p1 );
+half test_builtin_dot_vec_int16_to_half_promotion(half2 p0, int16_t2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 
-int test_builtin_dot_vec_int16_to_int_promotion( int2 p0, int16_t2 p1 ) {
-  return __builtin_hlsl_dot( p0, p1 );
+int test_builtin_dot_vec_int16_to_int_promotion(int2 p0, int16_t2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 
-int64_t test_builtin_dot_vec_int16_to_int64_promotion( int64_t2 p0, int16_t2 p1 ) {
-  return __builtin_hlsl_dot( p0, p1 );
+int64_t test_builtin_dot_vec_int16_to_int64_promotion(int64_t2 p0,
+                                                      int16_t2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have vectors of the same type}}
 }
 #endif
 
-float test_builtin_dot_float2_splat ( float p0, float2 p1 ) {
-  return __builtin_hlsl_dot( p0, p1 );
+float test_builtin_dot_float2_splat(float p0, float2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
 }
 
-float test_builtin_dot_float3_splat ( float p0, float3 p1 ) {
-  return __builtin_hlsl_dot( p0, p1 );
+float test_builtin_dot_float3_splat(float p0, float3 p1) {
+  return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
 }
 
-float test_builtin_dot_float4_splat ( float p0, float4 p1 ) {
-  return __builtin_hlsl_dot( p0, p1 );
+float test_builtin_dot_float4_splat(float p0, float4 p1) {
+  return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
 }
 
-float test_dot_float2_int_splat ( float2 p0, int p1 ) {
-  return __builtin_hlsl_dot ( p0, p1 );
+float test_dot_float2_int_splat(float2 p0, int p1) {
+  return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
 }
 
-float test_dot_float3_int_splat ( float3 p0, int p1 ) {
-  return __builtin_hlsl_dot ( p0, p1 );
+float test_dot_float3_int_splat(float3 p0, int p1) {
+  return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
 }
 
-float test_builtin_dot_int_vect_to_float_vec_promotion ( int2 p0, float p1 ) {
-  return __builtin_hlsl_dot ( p0, p1 );
+float test_builtin_dot_int_vect_to_float_vec_promotion(int2 p0, float p1) {
+  return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
 }
 
-int test_builtin_dot_bool_type_promotion ( bool p0, bool p1 ) {
-  return __builtin_hlsl_dot ( p0, p1 );
+int test_builtin_dot_bool_type_promotion(bool p0, bool p1) {
+  return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}}
 }
diff --git a/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
new file mode 100644
index 00000000000000..06dbdf0a68dfc1
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
@@ -0,0 +1,27 @@
+
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected
+
+float test_too_few_arg() {
+  return __builtin_hlsl_elementwise_frac();
+  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+}
+
+float2 test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_elementwise_frac(p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+}
+
+float builtin_bool_to_float_type_promotion(bool p1) {
+  return __builtin_hlsl_elementwise_frac(p1);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}}
+}
+
+float builtin_frac_int_to_float_promotion(int p1) {
+  return __builtin_hlsl_elementwise_frac(p1);
+  // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}}
+}
+
+float2 builtin_frac_int2_to_float2_promotion(int2 p1) {
+  return __builtin_hlsl_elementwise_frac(p1);
+  // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
index c062aa60a8df45..4ec5a4cdd26a30 100644
--- a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
@@ -1,91 +1,96 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected
 
-float2 test_no_second_arg ( float2 p0) {
-  return __builtin_hlsl_lerp ( p0 );
+float2 test_no_second_arg(float2 p0) {
+  return __builtin_hlsl_lerp(p0);
   // expected-error@-1 {{too few arguments to function call, expected 3, have 1}}
 }
 
-float2 test_no_third_arg ( float2 p0) {
-  return __builtin_hlsl_lerp ( p0, p0 );
+float2 test_no_third_arg(float2 p0) {
+  return __builtin_hlsl_lerp(p0, p0);
   // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
 }
 
-float2 test_too_many_arg ( float2 p0) {
-  return __builtin_hlsl_lerp ( p0, p0, p0, p0 );
+float2 test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_lerp(p0, p0, p0, p0);
   // expected-error@-1 {{too many arguments to function call, expected 3, have 4}}
 }
 
-float2 test_lerp_no_second_arg ( float2 p0) {
-  return lerp ( p0 );
+float2 test_lerp_no_second_arg(float2 p0) {
+  return lerp(p0);
   // expected-error@-1 {{no matching function for call to 'lerp'}}
 }
 
-float2 test_lerp_vector_size_mismatch ( float3 p0, float2 p1 ) {
-  return lerp ( p0, p0, p1 );
+float2 test_lerp_vector_size_mismatch(float3 p0, float2 p1) {
+  return lerp(p0, p0, p1);
   // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'float __attribute__((ext_vector_type(2)))' (vector of 2 'float' values)}}
 }
 
-float test_lerp_builtin_vector_size_mismatch ( float3 p0, float2 p1 ) {
-  return __builtin_hlsl_lerp ( p0, p1, p1 );
+float2 test_lerp_builtin_vector_size_mismatch(float3 p0, float2 p1) {
+  return __builtin_hlsl_lerp(p0, p1, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must have vectors of the same type}}
 }
 
-float test_lerp_scalar_mismatch ( float p0, half p1 ) {
-  return lerp ( p1, p0, p1 );
+float test_lerp_scalar_mismatch(float p0, half p1) {
+  return lerp(p1, p0, p1);
   // expected-error@-1 {{call to 'lerp' is ambiguous}}
 }
 
-float test_lerp_element_type_mismatch ( half2 p0, float2 p1 ) {
-  return lerp ( p1, p0, p1 );
+float2 test_lerp_element_type_mismatch(half2 p0, float2 p1) {
+  return lerp(p1, p0, p1);
   // expected-error@-1 {{call to 'lerp' is ambiguous}}
 }
 
-float test_builtin_lerp_float2_splat ( float p0, float2 p1 ) {
-  return __builtin_hlsl_lerp( p0, p1, p1 );
+float2 test_builtin_lerp_float2_splat(float p0, float2 p1) {
+  return __builtin_hlsl_lerp(p0, p1, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
 }
 
-float test_builtin_lerp_float3_splat ( float p0, float3 p1 ) {
-  return  __builtin_hlsl_lerp( p0, p1, p1 );
+float3 test_builtin_lerp_float3_splat(float p0, float3 p1) {
+  return __builtin_hlsl_lerp(p0, p1, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
 }
 
-float test_builtin_lerp_float4_splat ( float p0, float4 p1 ) {
-  return  __builtin_hlsl_lerp( p0, p1, p1 );
+float4 test_builtin_lerp_float4_splat(float p0, float4 p1) {
+  return __builtin_hlsl_lerp(p0, p1, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
 }
 
-float test_lerp_float2_int_splat ( float2 p0, int p1 ) {
-  return  __builtin_hlsl_lerp( p0, p1, p1 );
+float2 test_lerp_float2_int_splat(float2 p0, int p1) {
+  return __builtin_hlsl_lerp(p0, p1, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
 }
 
-float test_lerp_float3_int_splat ( float3 p0, int p1 ) {
-  return  __builtin_hlsl_lerp( p0, p1, p1 );
+float3 test_lerp_float3_int_splat(float3 p0, int p1) {
+  return __builtin_hlsl_lerp(p0, p1, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
 }
 
-float test_builtin_lerp_int_vect_to_float_vec_promotion ( int2 p0, float p1 ) {
-  return  __builtin_hlsl_lerp( p0, p1, p1 );
+float2 test_builtin_lerp_int_vect_to_float_vec_promotion(int2 p0, float p1) {
+  return __builtin_hlsl_lerp(p0, p1, p1);
   // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
 }
 
-int test_builtin_lerp_bool_type_promotion (bool p0) {
-  return  __builtin_hlsl_lerp( p0, p0, p0 );
+float test_builtin_lerp_bool_type_promotion(bool p0) {
+  return __builtin_hlsl_lerp(p0, p0, p0);
   // expected-error@-1 {{1st argument must be a floating point type (was 'bool')}}
 }
 
-float builtin_bool_to_float_type_promotion ( float p0, bool p1 ) {
-  return __builtin_hlsl_lerp ( p0, p0, p1 );
-   // expected-error@-1 {{3rd argument must be a floating point type (was 'bool')}}
+float builtin_bool_to_float_type_promotion(float p0, bool p1) {
+  return __builtin_hlsl_lerp(p0, p0, p1);
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'bool')}}
 }
 
-float builtin_bool_to_float_type_promotion2 ( bool p0, float p1 ) {
-  return __builtin_hlsl_lerp ( p1, p0, p1 );
+float builtin_bool_to_float_type_promotion2(bool p0, float p1) {
+  return __builtin_hlsl_lerp(p1, p0, p1);
   // expected-error@-1 {{2nd argument must be a floating point type (was 'bool')}}
 }
 
-float builtin_lerp_int_to_float_promotion ( float p0, int p1 ) {
-  return __builtin_hlsl_lerp ( p0, p0, p1 );
+float builtin_lerp_int_to_float_promotion(float p0, int p1) {
+  return __builtin_hlsl_lerp(p0, p0, p1);
   // expected-error@-1 {{3rd argument must be a floating point type (was 'int')}}
+}
+
+float4 test_lerp_int4(int4 p0, int4 p1, int4 p2) {
+  return __builtin_hlsl_lerp(p0, p1, p2);
+   // expected-error@-1 {{1st argument must be a floating point type (was 'int4' (aka 'vector<int, 4>'))}}
 }
\ No newline at end of file
diff --git a/clang/test/SemaHLSL/OverloadResolutionBugs.hlsl b/clang/test/SemaHLSL/OverloadResolutionBugs.hlsl
index 8464f1c1a7c2cd..c13cb299127aac 100644
--- a/clang/test/SemaHLSL/OverloadResolutionBugs.hlsl
+++ b/clang/test/SemaHLSL/OverloadResolutionBugs.hlsl
@@ -7,73 +7,67 @@
 void Fn3(double2 D);
 void Fn3(float2 F);
 
-void Call3(half2 H) {
-  Fn3(H);
-}
+void Call3(half2 H) { Fn3(H); }
 
 void Fn5(double2 D);
 
-void Call5(half2 H) {
-  Fn5(H);
-}
+void Call5(half2 H) { Fn5(H); }
 
 void Fn4(int64_t2 L);
 void Fn4(int2 I);
 
-void Call4(int16_t H) {
-  Fn4(H);
-}
+void Call4(int16_t H) { Fn4(H); }
 
-int test_builtin_dot_bool_type_promotion ( bool p0, bool p1 ) {
-  return dot ( p0, p1 );
+int test_builtin_dot_bool_type_promotion(bool p0, bool p1) {
+  return dot(p0, p1);
 }
 
-float test_dot_scalar_mismatch ( float p0, int p1 ) {
-  return dot ( p0, p1 );
-}
+float test_dot_scalar_mismatch(float p0, int p1) { return dot(p0, p1); }
 
-float test_dot_element_type_mismatch ( int2 p0, float2 p1 ) {
-  return dot ( p0, p1 );
-}
+float test_dot_element_type_mismatch(int2 p0, float2 p1) { return dot(p0, p1); }
 
-float test_builtin_dot_vec_int_to_float_promotion ( int2 p0, float2 p1 ) {
-  return dot ( p0, p1 );
+float test_builtin_dot_vec_int_to_float_promotion(int2 p0, float2 p1) {
+  return dot(p0, p1);
 }
 
-int64_t test_builtin_dot_vec_int_to_int64_promotion( int64_t2 p0, int2 p1 ) {
-  return dot ( p0, p1 );
+int64_t test_builtin_dot_vec_int_to_int64_promotion(int64_t2 p0, int2 p1) {
+  return dot(p0, p1);
 }
 
-float test_builtin_dot_vec_half_to_float_promotion( float2 p0, half2 p1 ) {
-  return dot( p0, p1 );
+float test_builtin_dot_vec_half_to_float_promotion(float2 p0, half2 p1) {
+  return dot(p0, p1);
 }
 
-float test_builtin_dot_vec_int16_to_float_promotion( float2 p0, int16_t2 p1 ) {
-  return dot( p0, p1 );
+float test_builtin_dot_vec_int16_to_float_promotion(float2 p0, int16_t2 p1) {
+  return dot(p0, p1);
 }
 
-half test_builtin_dot_vec_int16_to_half_promotion( half2 p0, int16_t2 p1 ) {
-  return dot( p0, p1 );
+half test_builtin_dot_vec_int16_to_half_promotion(half2 p0, int16_t2 p1) {
+  return dot(p0, p1);
 }
 
-int test_builtin_dot_vec_int16_to_int_promotion( int2 p0, int16_t2 p1 ) {
-  return dot( p0, p1 );
+int test_builtin_dot_vec_int16_to_int_promotion(int2 p0, int16_t2 p1) {
+  return dot(p0, p1);
 }
 
-int64_t test_builtin_dot_vec_int16_to_int64_promotion( int64_t2 p0, int16_t2 p1 ) {
-  return dot( p0, p1 );
+int64_t test_builtin_dot_vec_int16_to_int64_promotion(int64_t2 p0,
+                                                      int16_t2 p1) {
+  return dot(p0, p1);
 }
 
+float4 test_frac_int4(int4 p0) { return frac(p0); }
+
+float test_frac_int(int p0) { return frac(p0); }
+
+float test_frac_bool(bool p0) { return frac(p0); }
+
 // https://github.com/llvm/llvm-project/issues/81049
 
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.2-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefix=NO_HALF
 
-half sqrt_h(half x)
-{
-  return sqrt(x);
-}
+half sqrt_h(half x) { return sqrt(x); }
 
 // NO_HALF: define noundef float @"?sqrt_h@@YA$halff@$halff@@Z"(
 // NO_HALF: call float @llvm.sqrt.f32(float %0)
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index 48332b917693a0..b44d1c6d3d2f06 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -25,8 +25,10 @@ def int_dx_dot :
     [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
     [IntrNoMem, IntrWillReturn, Commutative] >;
 
+def int_dx_frac  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+
 def int_dx_lerp :
-    Intrinsic<[LLVMMatchType<0>], 
-    [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
-    [IntrNoMem, IntrWillReturn, Commutative] >;
+    Intrinsic<[LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
+    [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>,LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
+    [IntrNoMem, IntrWillReturn] >;
 }

From c9b1f1c337ce7c12664b670c0ba1022e5d5b8001 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <1802579+farzonl@users.noreply.github.com>
Date: Thu, 29 Feb 2024 13:41:40 -0500
Subject: [PATCH 51/55] [HLSL]  standardize builtin unit tests (#83340)

This PR brings best practices mentioned to me on other prs and adds them
to the existing builtin tests.

Note to reviewers: I put this up in two commits because the clang-format
changes is making it hard to tell what actually changed.
use the first commit to check for correctness.
---
 clang/test/CodeGenHLSL/builtins/abs.hlsl      | 134 ++++-------
 clang/test/CodeGenHLSL/builtins/ceil.hlsl     |  69 ++----
 clang/test/CodeGenHLSL/builtins/cos.hlsl      |  69 +++---
 clang/test/CodeGenHLSL/builtins/floor.hlsl    |  69 ++----
 clang/test/CodeGenHLSL/builtins/log.hlsl      |  67 +++---
 clang/test/CodeGenHLSL/builtins/log10.hlsl    |  69 +++---
 clang/test/CodeGenHLSL/builtins/log2.hlsl     |  69 +++---
 clang/test/CodeGenHLSL/builtins/max.hlsl      | 214 ++++++------------
 clang/test/CodeGenHLSL/builtins/min.hlsl      | 207 ++++++-----------
 clang/test/CodeGenHLSL/builtins/pow.hlsl      |  97 +++-----
 clang/test/CodeGenHLSL/builtins/sin.hlsl      |  59 ++---
 clang/test/CodeGenHLSL/builtins/trunc.hlsl    |  75 +++---
 .../SemaHLSL/VectorOverloadResolution.hlsl    |   2 +-
 13 files changed, 421 insertions(+), 779 deletions(-)

diff --git a/clang/test/CodeGenHLSL/builtins/abs.hlsl b/clang/test/CodeGenHLSL/builtins/abs.hlsl
index 54c9d1a9dded45..ad65cab2721a2b 100644
--- a/clang/test/CodeGenHLSL/builtins/abs.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/abs.hlsl
@@ -1,141 +1,93 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefix=NO_HALF
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
 using hlsl::abs;
 
 #ifdef __HLSL_ENABLE_16_BIT
-// CHECK: define noundef i16 @
-// CHECK: call i16 @llvm.abs.i16(
-int16_t test_abs_int16_t ( int16_t p0 ) {
-  return abs ( p0 );
-}
-// CHECK: define noundef <2 x i16> @
-// CHECK: call <2 x i16> @llvm.abs.v2i16(
-int16_t2 test_abs_int16_t2 ( int16_t2 p0 ) {
-  return abs ( p0 );
-}
-// CHECK: define noundef <3 x i16> @
-// CHECK: call <3 x i16> @llvm.abs.v3i16(
-int16_t3 test_abs_int16_t3 ( int16_t3 p0 ) {
-  return abs ( p0 );
-}
-// CHECK: define noundef <4 x i16> @
-// CHECK: call <4 x i16> @llvm.abs.v4i16(
-int16_t4 test_abs_int16_t4 ( int16_t4 p0 ) {
-  return abs ( p0 );
-}
+// NATIVE_HALF: define noundef i16 @
+// NATIVE_HALF: call i16 @llvm.abs.i16(
+int16_t test_abs_int16_t(int16_t p0) { return abs(p0); }
+// NATIVE_HALF: define noundef <2 x i16> @
+// NATIVE_HALF: call <2 x i16> @llvm.abs.v2i16(
+int16_t2 test_abs_int16_t2(int16_t2 p0) { return abs(p0); }
+// NATIVE_HALF: define noundef <3 x i16> @
+// NATIVE_HALF: call <3 x i16> @llvm.abs.v3i16(
+int16_t3 test_abs_int16_t3(int16_t3 p0) { return abs(p0); }
+// NATIVE_HALF: define noundef <4 x i16> @
+// NATIVE_HALF: call <4 x i16> @llvm.abs.v4i16(
+int16_t4 test_abs_int16_t4(int16_t4 p0) { return abs(p0); }
 #endif // __HLSL_ENABLE_16_BIT
 
-// CHECK: define noundef half @
-// CHECK: call half @llvm.fabs.f16(
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.fabs.f16(
 // NO_HALF: define noundef float @"?test_abs_half@@YA$halff@$halff@@Z"(
 // NO_HALF: call float @llvm.fabs.f32(float %0)
-half test_abs_half ( half p0 ) {
-  return abs ( p0 );
-}
-// CHECK: define noundef <2 x half> @
-// CHECK: call <2 x half> @llvm.fabs.v2f16(
+half test_abs_half(half p0) { return abs(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.fabs.v2f16(
 // NO_HALF: define noundef <2 x float> @"?test_abs_half2@@YAT?$__vector@$halff@$01@__clang@@T12@@Z"(
 // NO_HALF: call <2 x float> @llvm.fabs.v2f32(
-half2 test_abs_half2 ( half2 p0 ) {
-  return abs ( p0 );
-}
-// CHECK: define noundef <3 x half> @
-// CHECK: call <3 x half> @llvm.fabs.v3f16(
+half2 test_abs_half2(half2 p0) { return abs(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.fabs.v3f16(
 // NO_HALF: define noundef <3 x float> @"?test_abs_half3@@YAT?$__vector@$halff@$02@__clang@@T12@@Z"(
 // NO_HALF: call <3 x float> @llvm.fabs.v3f32(
-half3 test_abs_half3 ( half3 p0 ) {
-  return abs ( p0 );
-}
-// CHECK: define noundef <4 x half> @
-// CHECK: call <4 x half> @llvm.fabs.v4f16(
+half3 test_abs_half3(half3 p0) { return abs(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.fabs.v4f16(
 // NO_HALF: define noundef <4 x float> @"?test_abs_half4@@YAT?$__vector@$halff@$03@__clang@@T12@@Z"(
 // NO_HALF: call <4 x float> @llvm.fabs.v4f32(
-half4 test_abs_half4 ( half4 p0 ) {
-  return abs ( p0 );
-}
+half4 test_abs_half4(half4 p0) { return abs(p0); }
 // CHECK: define noundef i32 @
 // CHECK: call i32 @llvm.abs.i32(
-// NO_HALF: define noundef i32 @"?test_abs_int@@YAHH@Z"
-int test_abs_int ( int p0 ) {
-  return abs ( p0 );
-}
+int test_abs_int(int p0) { return abs(p0); }
 // CHECK: define noundef <2 x i32> @
 // CHECK: call <2 x i32> @llvm.abs.v2i32(
-int2 test_abs_int2 ( int2 p0 ) {
-  return abs ( p0 );
-}
+int2 test_abs_int2(int2 p0) { return abs(p0); }
 // CHECK: define noundef <3 x i32> @
 // CHECK: call <3 x i32> @llvm.abs.v3i32(
-int3 test_abs_int3 ( int3 p0 ) {
-  return abs ( p0 );
-}
+int3 test_abs_int3(int3 p0) { return abs(p0); }
 // CHECK: define noundef <4 x i32> @
 // CHECK: call <4 x i32> @llvm.abs.v4i32(
-int4 test_abs_int4 ( int4 p0 ) {
-  return abs ( p0 );
-}
+int4 test_abs_int4(int4 p0) { return abs(p0); }
 // CHECK: define noundef float @
 // CHECK: call float @llvm.fabs.f32(
-float test_abs_float ( float p0 ) {
-  return abs ( p0 );
-}
+float test_abs_float(float p0) { return abs(p0); }
 // CHECK: define noundef <2 x float> @
 // CHECK: call <2 x float> @llvm.fabs.v2f32(
-float2 test_abs_float2 ( float2 p0 ) {
-  return abs ( p0 );
-}
+float2 test_abs_float2(float2 p0) { return abs(p0); }
 // CHECK: define noundef <3 x float> @
 // CHECK: call <3 x float> @llvm.fabs.v3f32(
-float3 test_abs_float3 ( float3 p0 ) {
-  return abs ( p0 );
-}
+float3 test_abs_float3(float3 p0) { return abs(p0); }
 // CHECK: define noundef <4 x float> @
 // CHECK: call <4 x float> @llvm.fabs.v4f32(
-float4 test_abs_float4 ( float4 p0 ) {
-  return abs ( p0 );
-}
+float4 test_abs_float4(float4 p0) { return abs(p0); }
 // CHECK: define noundef i64 @
 // CHECK: call i64 @llvm.abs.i64(
-int64_t test_abs_int64_t ( int64_t p0 ) {
-  return abs ( p0 );
-}
+int64_t test_abs_int64_t(int64_t p0) { return abs(p0); }
 // CHECK: define noundef <2 x i64> @
 // CHECK: call <2 x i64> @llvm.abs.v2i64(
-int64_t2 test_abs_int64_t2 ( int64_t2 p0 ) {
-  return abs ( p0 );
-}
+int64_t2 test_abs_int64_t2(int64_t2 p0) { return abs(p0); }
 // CHECK: define noundef <3 x i64> @
 // CHECK: call <3 x i64> @llvm.abs.v3i64(
-int64_t3 test_abs_int64_t3 ( int64_t3 p0 ) {
-  return abs ( p0 );
-}
+int64_t3 test_abs_int64_t3(int64_t3 p0) { return abs(p0); }
 // CHECK: define noundef <4 x i64> @
 // CHECK: call <4 x i64> @llvm.abs.v4i64(
-int64_t4 test_abs_int64_t4 ( int64_t4 p0 ) {
-  return abs ( p0 );
-}
+int64_t4 test_abs_int64_t4(int64_t4 p0) { return abs(p0); }
 // CHECK: define noundef double @
 // CHECK: call double @llvm.fabs.f64(
-double test_abs_double ( double p0 ) {
-  return abs ( p0 );
-}
+double test_abs_double(double p0) { return abs(p0); }
 // CHECK: define noundef <2 x double> @
 // CHECK: call <2 x double> @llvm.fabs.v2f64(
-double2 test_abs_double2 ( double2 p0 ) {
-  return abs ( p0 );
-}
+double2 test_abs_double2(double2 p0) { return abs(p0); }
 // CHECK: define noundef <3 x double> @
 // CHECK: call <3 x double> @llvm.fabs.v3f64(
-double3 test_abs_double3 ( double3 p0 ) {
-  return abs ( p0 );
-}
+double3 test_abs_double3(double3 p0) { return abs(p0); }
 // CHECK: define noundef <4 x double> @
 // CHECK: call <4 x double> @llvm.fabs.v4f64(
-double4 test_abs_double4 ( double4 p0 ) {
-  return abs ( p0 );
-}
+double4 test_abs_double4(double4 p0) { return abs(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/ceil.hlsl b/clang/test/CodeGenHLSL/builtins/ceil.hlsl
index f1672816e72bc2..06d0d4c2cf546d 100644
--- a/clang/test/CodeGenHLSL/builtins/ceil.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/ceil.hlsl
@@ -1,79 +1,56 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefix=NO_HALF
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
 using hlsl::ceil;
 
-// CHECK: define noundef half @
-// CHECK: call half @llvm.ceil.f16(
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.ceil.f16(
 // NO_HALF: define noundef float @"?test_ceil_half@@YA$halff@$halff@@Z"(
 // NO_HALF: call float @llvm.ceil.f32(float %0)
-half test_ceil_half ( half p0 ) {
-  return ceil ( p0 );
-}
-// CHECK: define noundef <2 x half> @
-// CHECK: call <2 x half> @llvm.ceil.v2f16(
+half test_ceil_half(half p0) { return ceil(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.ceil.v2f16(
 // NO_HALF: define noundef <2 x float> @"?test_ceil_half2@@YAT?$__vector@$halff@$01@__clang@@T12@@Z"(
 // NO_HALF: call <2 x float> @llvm.ceil.v2f32(
-half2 test_ceil_half2 ( half2 p0 ) {
-  return ceil ( p0 );
-}
-// CHECK: define noundef <3 x half> @
-// CHECK: call <3 x half> @llvm.ceil.v3f16(
+half2 test_ceil_half2(half2 p0) { return ceil(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.ceil.v3f16(
 // NO_HALF: define noundef <3 x float> @"?test_ceil_half3@@YAT?$__vector@$halff@$02@__clang@@T12@@Z"(
 // NO_HALF: call <3 x float> @llvm.ceil.v3f32(
-half3 test_ceil_half3 ( half3 p0 ) {
-  return ceil ( p0 );
-}
-// CHECK: define noundef <4 x half> @
-// CHECK: call <4 x half> @llvm.ceil.v4f16(
+half3 test_ceil_half3(half3 p0) { return ceil(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.ceil.v4f16(
 // NO_HALF: define noundef <4 x float> @"?test_ceil_half4@@YAT?$__vector@$halff@$03@__clang@@T12@@Z"(
 // NO_HALF: call <4 x float> @llvm.ceil.v4f32(
-half4 test_ceil_half4 ( half4 p0 ) {
-  return ceil ( p0 );
-}
+half4 test_ceil_half4(half4 p0) { return ceil(p0); }
 
 // CHECK: define noundef float @
 // CHECK: call float @llvm.ceil.f32(
-float test_ceil_float ( float p0 ) {
-  return ceil ( p0 );
-}
+float test_ceil_float(float p0) { return ceil(p0); }
 // CHECK: define noundef <2 x float> @
 // CHECK: call <2 x float> @llvm.ceil.v2f32(
-float2 test_ceil_float2 ( float2 p0 ) {
-  return ceil ( p0 );
-}
+float2 test_ceil_float2(float2 p0) { return ceil(p0); }
 // CHECK: define noundef <3 x float> @
 // CHECK: call <3 x float> @llvm.ceil.v3f32(
-float3 test_ceil_float3 ( float3 p0 ) {
-  return ceil ( p0 );
-}
+float3 test_ceil_float3(float3 p0) { return ceil(p0); }
 // CHECK: define noundef <4 x float> @
 // CHECK: call <4 x float> @llvm.ceil.v4f32(
-float4 test_ceil_float4 ( float4 p0 ) {
-  return ceil ( p0 );
-}
+float4 test_ceil_float4(float4 p0) { return ceil(p0); }
 
 // CHECK: define noundef double @
 // CHECK: call double @llvm.ceil.f64(
-double test_ceil_double ( double p0 ) {
-  return ceil ( p0 );
-}
+double test_ceil_double(double p0) { return ceil(p0); }
 // CHECK: define noundef <2 x double> @
 // CHECK: call <2 x double> @llvm.ceil.v2f64(
-double2 test_ceil_double2 ( double2 p0 ) {
-  return ceil ( p0 );
-}
+double2 test_ceil_double2(double2 p0) { return ceil(p0); }
 // CHECK: define noundef <3 x double> @
 // CHECK: call <3 x double> @llvm.ceil.v3f64(
-double3 test_ceil_double3 ( double3 p0 ) {
-  return ceil ( p0 );
-}
+double3 test_ceil_double3(double3 p0) { return ceil(p0); }
 // CHECK: define noundef <4 x double> @
 // CHECK: call <4 x double> @llvm.ceil.v4f64(
-double4 test_ceil_double4 ( double4 p0 ) {
-  return ceil ( p0 );
-}
+double4 test_ceil_double4(double4 p0) { return ceil(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/cos.hlsl b/clang/test/CodeGenHLSL/builtins/cos.hlsl
index 2fc1571949b2c5..fb416fcaa49d76 100644
--- a/clang/test/CodeGenHLSL/builtins/cos.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/cos.hlsl
@@ -1,56 +1,41 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefix=NO_HALF
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// CHECK: define noundef half @
-// CHECK: call half @llvm.cos.f16(
-// NO_HALF: define noundef float @"?test_cos_half@@YA$halff@$halff@@Z"(
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.cos.f16(
+// NO_HALF: define noundef float @"?test_cos_half
 // NO_HALF: call float @llvm.cos.f32(
-half test_cos_half ( half p0 ) {
-  return cos ( p0 );
-}
-// CHECK: define noundef <2 x half> @
-// CHECK: call <2 x half> @llvm.cos.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_cos_float2@@YAT?$__vector@M$01@__clang@@T12@@Z"(
+half test_cos_half(half p0) { return cos(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.cos.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_cos_half2
 // NO_HALF: call <2 x float> @llvm.cos.v2f32(
-half2 test_cos_half2 ( half2 p0 ) {
-  return cos ( p0 );
-}
-// CHECK: define noundef <3 x half> @
-// CHECK: call <3 x half> @llvm.cos.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_cos_float3@@YAT?$__vector@M$02@__clang@@T12@@Z"(
+half2 test_cos_half2(half2 p0) { return cos(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.cos.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_cos_half3
 // NO_HALF: call <3 x float> @llvm.cos.v3f32(
-half3 test_cos_half3 ( half3 p0 ) {
-  return cos ( p0 );
-}
-// CHECK: define noundef <4 x half> @
-// CHECK: call <4 x half> @llvm.cos.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_cos_float4@@YAT?$__vector@M$03@__clang@@T12@@Z"(
+half3 test_cos_half3(half3 p0) { return cos(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.cos.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_cos_half4
 // NO_HALF: call <4 x float> @llvm.cos.v4f32(
-half4 test_cos_half4 ( half4 p0 ) {
-  return cos ( p0 );
-}
+half4 test_cos_half4(half4 p0) { return cos(p0); }
 
-// CHECK: define noundef float @
+// CHECK: define noundef float @"?test_cos_float
 // CHECK: call float @llvm.cos.f32(
-float test_cos_float ( float p0 ) {
-  return cos ( p0 );
-}
-// CHECK: define noundef <2 x float> @
+float test_cos_float(float p0) { return cos(p0); }
+// CHECK: define noundef <2 x float> @"?test_cos_float2
 // CHECK: call <2 x float> @llvm.cos.v2f32
-float2 test_cos_float2 ( float2 p0 ) {
-  return cos ( p0 );
-}
-// CHECK: define noundef <3 x float> @
+float2 test_cos_float2(float2 p0) { return cos(p0); }
+// CHECK: define noundef <3 x float> @"?test_cos_float3
 // CHECK: call <3 x float> @llvm.cos.v3f32
-float3 test_cos_float3 ( float3 p0 ) {
-  return cos ( p0 );
-}
-// CHECK: define noundef <4 x float> @
+float3 test_cos_float3(float3 p0) { return cos(p0); }
+// CHECK: define noundef <4 x float> @"?test_cos_float4
 // CHECK: call <4 x float> @llvm.cos.v4f32
-float4 test_cos_float4 ( float4 p0 ) {
-  return cos ( p0 );
-}
+float4 test_cos_float4(float4 p0) { return cos(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/floor.hlsl b/clang/test/CodeGenHLSL/builtins/floor.hlsl
index 357661761b762a..d2a2f6e52f1ec3 100644
--- a/clang/test/CodeGenHLSL/builtins/floor.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/floor.hlsl
@@ -1,79 +1,56 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefix=NO_HALF
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
 using hlsl::floor;
 
-// CHECK: define noundef half @
-// CHECK: call half @llvm.floor.f16(
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.floor.f16(
 // NO_HALF: define noundef float @"?test_floor_half@@YA$halff@$halff@@Z"(
 // NO_HALF: call float @llvm.floor.f32(float %0)
-half test_floor_half ( half p0 ) {
-  return floor ( p0 );
-}
-// CHECK: define noundef <2 x half> @
-// CHECK: call <2 x half> @llvm.floor.v2f16(
+half test_floor_half(half p0) { return floor(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.floor.v2f16(
 // NO_HALF: define noundef <2 x float> @"?test_floor_half2@@YAT?$__vector@$halff@$01@__clang@@T12@@Z"(
 // NO_HALF: call <2 x float> @llvm.floor.v2f32(
-half2 test_floor_half2 ( half2 p0 ) {
-  return floor ( p0 );
-}
-// CHECK: define noundef <3 x half> @
-// CHECK: call <3 x half> @llvm.floor.v3f16(
+half2 test_floor_half2(half2 p0) { return floor(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.floor.v3f16(
 // NO_HALF: define noundef <3 x float> @"?test_floor_half3@@YAT?$__vector@$halff@$02@__clang@@T12@@Z"(
 // NO_HALF: call <3 x float> @llvm.floor.v3f32(
-half3 test_floor_half3 ( half3 p0 ) {
-  return floor ( p0 );
-}
-// CHECK: define noundef <4 x half> @
-// CHECK: call <4 x half> @llvm.floor.v4f16(
+half3 test_floor_half3(half3 p0) { return floor(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.floor.v4f16(
 // NO_HALF: define noundef <4 x float> @"?test_floor_half4@@YAT?$__vector@$halff@$03@__clang@@T12@@Z"(
 // NO_HALF: call <4 x float> @llvm.floor.v4f32(
-half4 test_floor_half4 ( half4 p0 ) {
-  return floor ( p0 );
-}
+half4 test_floor_half4(half4 p0) { return floor(p0); }
 
 // CHECK: define noundef float @
 // CHECK: call float @llvm.floor.f32(
-float test_floor_float ( float p0 ) {
-  return floor ( p0 );
-}
+float test_floor_float(float p0) { return floor(p0); }
 // CHECK: define noundef <2 x float> @
 // CHECK: call <2 x float> @llvm.floor.v2f32(
-float2 test_floor_float2 ( float2 p0 ) {
-  return floor ( p0 );
-}
+float2 test_floor_float2(float2 p0) { return floor(p0); }
 // CHECK: define noundef <3 x float> @
 // CHECK: call <3 x float> @llvm.floor.v3f32(
-float3 test_floor_float3 ( float3 p0 ) {
-  return floor ( p0 );
-}
+float3 test_floor_float3(float3 p0) { return floor(p0); }
 // CHECK: define noundef <4 x float> @
 // CHECK: call <4 x float> @llvm.floor.v4f32(
-float4 test_floor_float4 ( float4 p0 ) {
-  return floor ( p0 );
-}
+float4 test_floor_float4(float4 p0) { return floor(p0); }
 
 // CHECK: define noundef double @
 // CHECK: call double @llvm.floor.f64(
-double test_floor_double ( double p0 ) {
-  return floor ( p0 );
-}
+double test_floor_double(double p0) { return floor(p0); }
 // CHECK: define noundef <2 x double> @
 // CHECK: call <2 x double> @llvm.floor.v2f64(
-double2 test_floor_double2 ( double2 p0 ) {
-  return floor ( p0 );
-}
+double2 test_floor_double2(double2 p0) { return floor(p0); }
 // CHECK: define noundef <3 x double> @
 // CHECK: call <3 x double> @llvm.floor.v3f64(
-double3 test_floor_double3 ( double3 p0 ) {
-  return floor ( p0 );
-}
+double3 test_floor_double3(double3 p0) { return floor(p0); }
 // CHECK: define noundef <4 x double> @
 // CHECK: call <4 x double> @llvm.floor.v4f64(
-double4 test_floor_double4 ( double4 p0 ) {
-  return floor ( p0 );
-}
+double4 test_floor_double4(double4 p0) { return floor(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/log.hlsl b/clang/test/CodeGenHLSL/builtins/log.hlsl
index 6a8e4ac2e5f294..ecbdf1e98ac346 100644
--- a/clang/test/CodeGenHLSL/builtins/log.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log.hlsl
@@ -1,56 +1,41 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefix=NO_HALF
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// CHECK: define noundef half @
-// CHECK: call half @llvm.log.f16(
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.log.f16(
 // NO_HALF: define noundef float @"?test_log_half@@YA$halff@$halff@@Z"(
 // NO_HALF: call float @llvm.log.f32(
-half test_log_half ( half p0 ) {
-  return log ( p0 );
-}
-// CHECK: define noundef <2 x half> @
-// CHECK: call <2 x half> @llvm.log.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_log_float2@@YAT?$__vector@M$01@__clang@@T12@@Z"(
+half test_log_half(half p0) { return log(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.log.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_log_half2
 // NO_HALF: call <2 x float> @llvm.log.v2f32(
-half2 test_log_half2 ( half2 p0 ) {
-  return log ( p0 );
-}
-// CHECK: define noundef <3 x half> @
-// CHECK: call <3 x half> @llvm.log.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_log_float3@@YAT?$__vector@M$02@__clang@@T12@@Z"(
+half2 test_log_half2(half2 p0) { return log(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.log.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_log_half3
 // NO_HALF: call <3 x float> @llvm.log.v3f32(
-half3 test_log_half3 ( half3 p0 ) {
-  return log ( p0 );
-}
-// CHECK: define noundef <4 x half> @
-// CHECK: call <4 x half> @llvm.log.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_log_float4@@YAT?$__vector@M$03@__clang@@T12@@Z"(
+half3 test_log_half3(half3 p0) { return log(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.log.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_log_half4
 // NO_HALF: call <4 x float> @llvm.log.v4f32(
-half4 test_log_half4 ( half4 p0 ) {
-  return log ( p0 );
-}
+half4 test_log_half4(half4 p0) { return log(p0); }
 
-// CHECK: define noundef float @
+// CHECK: define noundef float @"?test_log_float
 // CHECK: call float @llvm.log.f32(
-float test_log_float ( float p0 ) {
-  return log ( p0 );
-}
-// CHECK: define noundef <2 x float> @
+float test_log_float(float p0) { return log(p0); }
+// CHECK: define noundef <2 x float> @"?test_log_float2
 // CHECK: call <2 x float> @llvm.log.v2f32
-float2 test_log_float2 ( float2 p0 ) {
-  return log ( p0 );
-}
-// CHECK: define noundef <3 x float> @
+float2 test_log_float2(float2 p0) { return log(p0); }
+// CHECK: define noundef <3 x float> @"?test_log_float3
 // CHECK: call <3 x float> @llvm.log.v3f32
-float3 test_log_float3 ( float3 p0 ) {
-  return log ( p0 );
-}
-// CHECK: define noundef <4 x float> @
+float3 test_log_float3(float3 p0) { return log(p0); }
+// CHECK: define noundef <4 x float> @"?test_log_float4
 // CHECK: call <4 x float> @llvm.log.v4f32
-float4 test_log_float4 ( float4 p0 ) {
-  return log ( p0 );
-}
+float4 test_log_float4(float4 p0) { return log(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/log10.hlsl b/clang/test/CodeGenHLSL/builtins/log10.hlsl
index 8ce24fd530dd3c..638b86e8d5eaf7 100644
--- a/clang/test/CodeGenHLSL/builtins/log10.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log10.hlsl
@@ -1,56 +1,41 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefix=NO_HALF
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// CHECK: define noundef half @
-// CHECK: call half @llvm.log10.f16(
-// NO_HALF: define noundef float @"?test_log10_half@@YA$halff@$halff@@Z"(
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.log10.f16(
+// NO_HALF: define noundef float @"?test_log10_half
 // NO_HALF: call float @llvm.log10.f32(
-half test_log10_half ( half p0 ) {
-  return log10 ( p0 );
-}
-// CHECK: define noundef <2 x half> @
-// CHECK: call <2 x half> @llvm.log10.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_log10_float2@@YAT?$__vector@M$01@__clang@@T12@@Z"(
+half test_log10_half(half p0) { return log10(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.log10.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_log10_half2
 // NO_HALF: call <2 x float> @llvm.log10.v2f32(
-half2 test_log10_half2 ( half2 p0 ) {
-  return log10 ( p0 );
-}
-// CHECK: define noundef <3 x half> @
-// CHECK: call <3 x half> @llvm.log10.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_log10_float3@@YAT?$__vector@M$02@__clang@@T12@@Z"(
+half2 test_log10_half2(half2 p0) { return log10(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.log10.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_log10_half3
 // NO_HALF: call <3 x float> @llvm.log10.v3f32(
-half3 test_log10_half3 ( half3 p0 ) {
-  return log10 ( p0 );
-}
-// CHECK: define noundef <4 x half> @
-// CHECK: call <4 x half> @llvm.log10.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_log10_float4@@YAT?$__vector@M$03@__clang@@T12@@Z"(
+half3 test_log10_half3(half3 p0) { return log10(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.log10.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_log10_half4
 // NO_HALF: call <4 x float> @llvm.log10.v4f32(
-half4 test_log10_half4 ( half4 p0 ) {
-  return log10 ( p0 );
-}
+half4 test_log10_half4(half4 p0) { return log10(p0); }
 
-// CHECK: define noundef float @
+// CHECK: define noundef float @"?test_log10_float
 // CHECK: call float @llvm.log10.f32(
-float test_log10_float ( float p0 ) {
-  return log10 ( p0 );
-}
-// CHECK: define noundef <2 x float> @
+float test_log10_float(float p0) { return log10(p0); }
+// CHECK: define noundef <2 x float> @"?test_log10_float2
 // CHECK: call <2 x float> @llvm.log10.v2f32
-float2 test_log10_float2 ( float2 p0 ) {
-  return log10 ( p0 );
-}
-// CHECK: define noundef <3 x float> @
+float2 test_log10_float2(float2 p0) { return log10(p0); }
+// CHECK: define noundef <3 x float> @"?test_log10_float3
 // CHECK: call <3 x float> @llvm.log10.v3f32
-float3 test_log10_float3 ( float3 p0 ) {
-  return log10 ( p0 );
-}
-// CHECK: define noundef <4 x float> @
+float3 test_log10_float3(float3 p0) { return log10(p0); }
+// CHECK: define noundef <4 x float> @"?test_log10_float4
 // CHECK: call <4 x float> @llvm.log10.v4f32
-float4 test_log10_float4 ( float4 p0 ) {
-  return log10 ( p0 );
-}
+float4 test_log10_float4(float4 p0) { return log10(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/log2.hlsl b/clang/test/CodeGenHLSL/builtins/log2.hlsl
index f0f0a6c7c50e81..9ed8185a06b04f 100644
--- a/clang/test/CodeGenHLSL/builtins/log2.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log2.hlsl
@@ -1,56 +1,41 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefix=NO_HALF
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// CHECK: define noundef half @
-// CHECK: call half @llvm.log2.f16(
-// NO_HALF: define noundef float @"?test_log2_half@@YA$halff@$halff@@Z"(
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.log2.f16(
+// NO_HALF: define noundef float @"?test_log2_half
 // NO_HALF: call float @llvm.log2.f32(
-half test_log2_half ( half p0 ) {
-  return log2 ( p0 );
-}
-// CHECK: define noundef <2 x half> @
-// CHECK: call <2 x half> @llvm.log2.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_log2_float2@@YAT?$__vector@M$01@__clang@@T12@@Z"(
+half test_log2_half(half p0) { return log2(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.log2.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_log2_half2
 // NO_HALF: call <2 x float> @llvm.log2.v2f32(
-half2 test_log2_half2 ( half2 p0 ) {
-  return log2 ( p0 );
-}
-// CHECK: define noundef <3 x half> @
-// CHECK: call <3 x half> @llvm.log2.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_log2_float3@@YAT?$__vector@M$02@__clang@@T12@@Z"(
+half2 test_log2_half2(half2 p0) { return log2(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.log2.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_log2_half3
 // NO_HALF: call <3 x float> @llvm.log2.v3f32(
-half3 test_log2_half3 ( half3 p0 ) {
-  return log2 ( p0 );
-}
-// CHECK: define noundef <4 x half> @
-// CHECK: call <4 x half> @llvm.log2.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_log2_float4@@YAT?$__vector@M$03@__clang@@T12@@Z"(
+half3 test_log2_half3(half3 p0) { return log2(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.log2.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_log2_half4
 // NO_HALF: call <4 x float> @llvm.log2.v4f32(
-half4 test_log2_half4 ( half4 p0 ) {
-  return log2 ( p0 );
-}
+half4 test_log2_half4(half4 p0) { return log2(p0); }
 
-// CHECK: define noundef float @
+// CHECK: define noundef float @"?test_log2_float
 // CHECK: call float @llvm.log2.f32(
-float test_log2_float ( float p0 ) {
-  return log2 ( p0 );
-}
-// CHECK: define noundef <2 x float> @
+float test_log2_float(float p0) { return log2(p0); }
+// CHECK: define noundef <2 x float> @"?test_log2_float2
 // CHECK: call <2 x float> @llvm.log2.v2f32
-float2 test_log2_float2 ( float2 p0 ) {
-  return log2 ( p0 );
-}
-// CHECK: define noundef <3 x float> @
+float2 test_log2_float2(float2 p0) { return log2(p0); }
+// CHECK: define noundef <3 x float> @"?test_log2_float3
 // CHECK: call <3 x float> @llvm.log2.v3f32
-float3 test_log2_float3 ( float3 p0 ) {
-  return log2 ( p0 );
-}
-// CHECK: define noundef <4 x float> @
+float3 test_log2_float3(float3 p0) { return log2(p0); }
+// CHECK: define noundef <4 x float> @"?test_log2_float4
 // CHECK: call <4 x float> @llvm.log2.v4f32
-float4 test_log2_float4 ( float4 p0 ) {
-  return log2 ( p0 );
-}
+float4 test_log2_float4(float4 p0) { return log2(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/max.hlsl b/clang/test/CodeGenHLSL/builtins/max.hlsl
index d8879c3332fb88..272d1e8a10bd7c 100644
--- a/clang/test/CodeGenHLSL/builtins/max.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/max.hlsl
@@ -1,206 +1,134 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefix=NO_HALF
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
 #ifdef __HLSL_ENABLE_16_BIT
-// CHECK: define noundef i16 @
-// CHECK: call i16 @llvm.smax.i16(
-int16_t test_max_short ( int16_t p0, int16_t p1 ) {
-  return max ( p0, p1 );
-}
-// CHECK: define noundef <2 x i16> @
-// CHECK: call <2 x i16> @llvm.smax.v2i16(
-int16_t2 test_max_short2 ( int16_t2 p0, int16_t2 p1 ) {
-  return max ( p0, p1 );
-}
-// CHECK: define noundef <3 x i16> @
-// CHECK: call <3 x i16> @llvm.smax.v3i16
-int16_t3 test_max_short3 ( int16_t3 p0, int16_t3 p1 ) {
-  return max ( p0, p1 );
-}
-// CHECK: define noundef <4 x i16> @
-// CHECK: call <4 x i16> @llvm.smax.v4i16
-int16_t4 test_max_short4 ( int16_t4 p0, int16_t4 p1 ) {
-  return max ( p0, p1 );
-}
+// NATIVE_HALF: define noundef i16 @
+// NATIVE_HALF: call i16 @llvm.smax.i16(
+int16_t test_max_short(int16_t p0, int16_t p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <2 x i16> @
+// NATIVE_HALF: call <2 x i16> @llvm.smax.v2i16(
+int16_t2 test_max_short2(int16_t2 p0, int16_t2 p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <3 x i16> @
+// NATIVE_HALF: call <3 x i16> @llvm.smax.v3i16
+int16_t3 test_max_short3(int16_t3 p0, int16_t3 p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <4 x i16> @
+// NATIVE_HALF: call <4 x i16> @llvm.smax.v4i16
+int16_t4 test_max_short4(int16_t4 p0, int16_t4 p1) { return max(p0, p1); }
 
-// CHECK: define noundef i16 @
-// CHECK: call i16 @llvm.umax.i16(
-uint16_t test_max_ushort ( uint16_t p0, uint16_t p1 ) {
-  return max ( p0, p1 );
-}
-// CHECK: define noundef <2 x i16> @
-// CHECK: call <2 x i16> @llvm.umax.v2i16
-uint16_t2 test_max_ushort2 ( uint16_t2 p0, uint16_t2 p1 ) {
-  return max ( p0, p1 );
-}
-// CHECK: define noundef <3 x i16> @
-// CHECK: call <3 x i16> @llvm.umax.v3i16
-uint16_t3 test_max_ushort3 ( uint16_t3 p0, uint16_t3 p1 ) {
-  return max ( p0, p1 );
-}
-// CHECK: define noundef <4 x i16> @
-// CHECK: call <4 x i16> @llvm.umax.v4i16
-uint16_t4 test_max_ushort4 ( uint16_t4 p0, uint16_t4 p1 ) {
-  return max ( p0, p1 );
-}
+// NATIVE_HALF: define noundef i16 @
+// NATIVE_HALF: call i16 @llvm.umax.i16(
+uint16_t test_max_ushort(uint16_t p0, uint16_t p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <2 x i16> @
+// NATIVE_HALF: call <2 x i16> @llvm.umax.v2i16
+uint16_t2 test_max_ushort2(uint16_t2 p0, uint16_t2 p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <3 x i16> @
+// NATIVE_HALF: call <3 x i16> @llvm.umax.v3i16
+uint16_t3 test_max_ushort3(uint16_t3 p0, uint16_t3 p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <4 x i16> @
+// NATIVE_HALF: call <4 x i16> @llvm.umax.v4i16
+uint16_t4 test_max_ushort4(uint16_t4 p0, uint16_t4 p1) { return max(p0, p1); }
 #endif
 
 // CHECK: define noundef i32 @
 // CHECK: call i32 @llvm.smax.i32(
-int test_max_int ( int p0, int p1 ) {
-  return max ( p0, p1 );
-}
+int test_max_int(int p0, int p1) { return max(p0, p1); }
 // CHECK: define noundef <2 x i32> @
 // CHECK: call <2 x i32> @llvm.smax.v2i32
-int2 test_max_int2 ( int2 p0, int2 p1 ) {
-  return max ( p0, p1 );
-}
+int2 test_max_int2(int2 p0, int2 p1) { return max(p0, p1); }
 // CHECK: define noundef <3 x i32> @
 // CHECK: call <3 x i32> @llvm.smax.v3i32
-int3 test_max_int3 ( int3 p0, int3 p1 ) {
-  return max ( p0, p1 );
-}
+int3 test_max_int3(int3 p0, int3 p1) { return max(p0, p1); }
 // CHECK: define noundef <4 x i32> @
 // CHECK: call <4 x i32> @llvm.smax.v4i32
-int4 test_max_int4 ( int4 p0, int4 p1) {
-  return max ( p0, p1 );
-}
+int4 test_max_int4(int4 p0, int4 p1) { return max(p0, p1); }
 
 // CHECK: define noundef i32 @
 // CHECK: call i32 @llvm.umax.i32(
-int test_max_uint ( uint p0, uint p1 ) {
-  return max ( p0, p1 );
-}
+int test_max_uint(uint p0, uint p1) { return max(p0, p1); }
 // CHECK: define noundef <2 x i32> @
 // CHECK: call <2 x i32> @llvm.umax.v2i32
-uint2 test_max_uint2 ( uint2 p0, uint2 p1 ) {
-  return max ( p0, p1 );
-}
+uint2 test_max_uint2(uint2 p0, uint2 p1) { return max(p0, p1); }
 // CHECK: define noundef <3 x i32> @
 // CHECK: call <3 x i32> @llvm.umax.v3i32
-uint3 test_max_uint3 ( uint3 p0, uint3 p1 ) {
-  return max ( p0, p1 );
-}
+uint3 test_max_uint3(uint3 p0, uint3 p1) { return max(p0, p1); }
 // CHECK: define noundef <4 x i32> @
 // CHECK: call <4 x i32> @llvm.umax.v4i32
-uint4 test_max_uint4 ( uint4 p0, uint4 p1) {
-  return max ( p0, p1 );
-}
+uint4 test_max_uint4(uint4 p0, uint4 p1) { return max(p0, p1); }
 
 // CHECK: define noundef i64 @
 // CHECK: call i64 @llvm.smax.i64(
-int64_t test_max_long ( int64_t p0, int64_t p1 ) {
-  return max ( p0, p1 );
-}
+int64_t test_max_long(int64_t p0, int64_t p1) { return max(p0, p1); }
 // CHECK: define noundef <2 x i64> @
 // CHECK: call <2 x i64> @llvm.smax.v2i64
-int64_t2 test_max_long2 ( int64_t2 p0, int64_t2 p1 ) {
-  return max ( p0, p1 );
-}
+int64_t2 test_max_long2(int64_t2 p0, int64_t2 p1) { return max(p0, p1); }
 // CHECK: define noundef <3 x i64> @
 // CHECK: call <3 x i64> @llvm.smax.v3i64
-int64_t3 test_max_long3 ( int64_t3 p0, int64_t3 p1 ) {
-  return max ( p0, p1 );
-}
+int64_t3 test_max_long3(int64_t3 p0, int64_t3 p1) { return max(p0, p1); }
 // CHECK: define noundef <4 x i64> @
 // CHECK: call <4 x i64> @llvm.smax.v4i64
-int64_t4 test_max_long4 ( int64_t4 p0, int64_t4 p1) {
-  return max ( p0, p1 );
-}
+int64_t4 test_max_long4(int64_t4 p0, int64_t4 p1) { return max(p0, p1); }
 
 // CHECK: define noundef i64 @
 // CHECK: call i64 @llvm.umax.i64(
-uint64_t test_max_long ( uint64_t p0, uint64_t p1 ) {
-  return max ( p0, p1 );
-}
+uint64_t test_max_long(uint64_t p0, uint64_t p1) { return max(p0, p1); }
 // CHECK: define noundef <2 x i64> @
 // CHECK: call <2 x i64> @llvm.umax.v2i64
-uint64_t2 test_max_long2 ( uint64_t2 p0, uint64_t2 p1 ) {
-  return max ( p0, p1 );
-}
+uint64_t2 test_max_long2(uint64_t2 p0, uint64_t2 p1) { return max(p0, p1); }
 // CHECK: define noundef <3 x i64> @
 // CHECK: call <3 x i64> @llvm.umax.v3i64
-uint64_t3 test_max_long3 ( uint64_t3 p0, uint64_t3 p1 ) {
-  return max ( p0, p1 );
-}
+uint64_t3 test_max_long3(uint64_t3 p0, uint64_t3 p1) { return max(p0, p1); }
 // CHECK: define noundef <4 x i64> @
 // CHECK: call <4 x i64> @llvm.umax.v4i64
-uint64_t4 test_max_long4 ( uint64_t4 p0, uint64_t4 p1) {
-  return max ( p0, p1 );
-}
+uint64_t4 test_max_long4(uint64_t4 p0, uint64_t4 p1) { return max(p0, p1); }
 
-
-// CHECK: define noundef half @
-// CHECK: call half @llvm.maxnum.f16(
-// NO_HALF: define noundef float @"?test_max_half@@YA$halff@$halff@0@Z"(
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.maxnum.f16(
+// NO_HALF: define noundef float @"?test_max_half
 // NO_HALF: call float @llvm.maxnum.f32(
-half test_max_half ( half p0, half p1 ) {
-  return max ( p0, p1 );
-}
-// CHECK: define noundef <2 x half> @
-// CHECK: call <2 x half> @llvm.maxnum.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_max_float2@@YAT?$__vector@M$01@__clang@@T12@0@Z"(
+half test_max_half(half p0, half p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.maxnum.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_max_half2
 // NO_HALF: call <2 x float> @llvm.maxnum.v2f32(
-half2 test_max_half2 ( half2 p0, half2 p1 ) {
-  return max ( p0, p1 );
-}
-// CHECK: define noundef <3 x half> @
-// CHECK: call <3 x half> @llvm.maxnum.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_max_float3@@YAT?$__vector@M$02@__clang@@T12@0@Z"(
+half2 test_max_half2(half2 p0, half2 p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.maxnum.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_max_half3
 // NO_HALF: call <3 x float> @llvm.maxnum.v3f32(
-half3 test_max_half3 ( half3 p0, half3 p1 ) {
-  return max ( p0, p1 );
-}
-// CHECK: define noundef <4 x half> @
-// CHECK: call <4 x half> @llvm.maxnum.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_max_float4@@YAT?$__vector@M$03@__clang@@T12@0@Z"(
+half3 test_max_half3(half3 p0, half3 p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.maxnum.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_max_half4
 // NO_HALF: call <4 x float> @llvm.maxnum.v4f32(
-half4 test_max_half4 ( half4 p0, half4 p1 ) {
-  return max ( p0, p1 );
-}
+half4 test_max_half4(half4 p0, half4 p1) { return max(p0, p1); }
 
-// CHECK: define noundef float @
+// CHECK: define noundef float @"?test_max_float
 // CHECK: call float @llvm.maxnum.f32(
-float test_max_float ( float p0, float p1 ) {
-  return max ( p0, p1 );
-}
-// CHECK: define noundef <2 x float> @
+float test_max_float(float p0, float p1) { return max(p0, p1); }
+// CHECK: define noundef <2 x float> @"?test_max_float2
 // CHECK: call <2 x float> @llvm.maxnum.v2f32
-float2 test_max_float2 ( float2 p0, float2 p1 ) {
-  return max ( p0, p1 );
-}
-// CHECK: define noundef <3 x float> @
+float2 test_max_float2(float2 p0, float2 p1) { return max(p0, p1); }
+// CHECK: define noundef <3 x float> @"?test_max_float3
 // CHECK: call <3 x float> @llvm.maxnum.v3f32
-float3 test_max_float3 ( float3 p0, float3 p1 ) {
-  return max ( p0, p1 );
-}
-// CHECK: define noundef <4 x float> @
+float3 test_max_float3(float3 p0, float3 p1) { return max(p0, p1); }
+// CHECK: define noundef <4 x float> @"?test_max_float4
 // CHECK: call <4 x float> @llvm.maxnum.v4f32
-float4 test_max_float4 ( float4 p0, float4 p1) {
-  return max ( p0, p1 );
-}
+float4 test_max_float4(float4 p0, float4 p1) { return max(p0, p1); }
 
 // CHECK: define noundef double @
 // CHECK: call double @llvm.maxnum.f64(
-double test_max_double ( double p0, double p1 ) {
-  return max ( p0, p1 );
-}
+double test_max_double(double p0, double p1) { return max(p0, p1); }
 // CHECK: define noundef <2 x double> @
 // CHECK: call <2 x double> @llvm.maxnum.v2f64
-double2 test_max_double2 ( double2 p0, double2 p1 ) {
-  return max ( p0, p1 );
-}
+double2 test_max_double2(double2 p0, double2 p1) { return max(p0, p1); }
 // CHECK: define noundef <3 x double> @
 // CHECK: call <3 x double> @llvm.maxnum.v3f64
-double3 test_max_double3 ( double3 p0, double3 p1 ) {
-  return max ( p0, p1 );
-}
+double3 test_max_double3(double3 p0, double3 p1) { return max(p0, p1); }
 // CHECK: define noundef <4 x double> @
 // CHECK: call <4 x double> @llvm.maxnum.v4f64
-double4 test_max_double4 ( double4 p0, double4 p1) {
-  return max ( p0, p1 );
-}
+double4 test_max_double4(double4 p0, double4 p1) { return max(p0, p1); }
diff --git a/clang/test/CodeGenHLSL/builtins/min.hlsl b/clang/test/CodeGenHLSL/builtins/min.hlsl
index 743053cbdd2620..a0c233dac4d5fc 100644
--- a/clang/test/CodeGenHLSL/builtins/min.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/min.hlsl
@@ -1,207 +1,134 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefix=NO_HALF
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
 #ifdef __HLSL_ENABLE_16_BIT
-// CHECK: define noundef i16 @
-// CHECK: call i16 @llvm.smin.i16(
-int16_t test_min_short ( int16_t p0, int16_t p1 ) {
-  return min ( p0, p1 );
-}
-// CHECK: define noundef <2 x i16> @
-// CHECK: call <2 x i16> @llvm.smin.v2i16(
-int16_t2 test_min_short2 ( int16_t2 p0, int16_t2 p1 ) {
-  return min ( p0, p1 );
-}
-// CHECK: define noundef <3 x i16> @
-// CHECK: call <3 x i16> @llvm.smin.v3i16
-int16_t3 test_min_short3 ( int16_t3 p0, int16_t3 p1 ) {
-  return min ( p0, p1 );
-}
-// CHECK: define noundef <4 x i16> @
-// CHECK: call <4 x i16> @llvm.smin.v4i16
-int16_t4 test_min_short4 ( int16_t4 p0, int16_t4 p1 ) {
-  return min ( p0, p1 );
-}
+// NATIVE_HALF: define noundef i16 @
+// NATIVE_HALF: call i16 @llvm.smin.i16(
+int16_t test_min_short(int16_t p0, int16_t p1) { return min(p0, p1); }
+// NATIVE_HALF: define noundef <2 x i16> @
+// NATIVE_HALF: call <2 x i16> @llvm.smin.v2i16(
+int16_t2 test_min_short2(int16_t2 p0, int16_t2 p1) { return min(p0, p1); }
+// NATIVE_HALF: define noundef <3 x i16> @
+// NATIVE_HALF: call <3 x i16> @llvm.smin.v3i16
+int16_t3 test_min_short3(int16_t3 p0, int16_t3 p1) { return min(p0, p1); }
+// NATIVE_HALF: define noundef <4 x i16> @
+// NATIVE_HALF: call <4 x i16> @llvm.smin.v4i16
+int16_t4 test_min_short4(int16_t4 p0, int16_t4 p1) { return min(p0, p1); }
 
-
-// CHECK: define noundef i16 @
-// CHECK: call i16 @llvm.umin.i16(
-uint16_t test_min_ushort ( uint16_t p0, uint16_t p1 ) {
-  return min ( p0, p1 );
-}
-// CHECK: define noundef <2 x i16> @
-// CHECK: call <2 x i16> @llvm.umin.v2i16
-uint16_t2 test_min_ushort2 ( uint16_t2 p0, uint16_t2 p1 ) {
-  return min ( p0, p1 );
-}
-// CHECK: define noundef <3 x i16> @
-// CHECK: call <3 x i16> @llvm.umin.v3i16
-uint16_t3 test_min_ushort3 ( uint16_t3 p0, uint16_t3 p1 ) {
-  return min ( p0, p1 );
-}
-// CHECK: define noundef <4 x i16> @
-// CHECK: call <4 x i16> @llvm.umin.v4i16
-uint16_t4 test_min_ushort4 ( uint16_t4 p0, uint16_t4 p1 ) {
-  return min ( p0, p1 );
-}
+// NATIVE_HALF: define noundef i16 @
+// NATIVE_HALF: call i16 @llvm.umin.i16(
+uint16_t test_min_ushort(uint16_t p0, uint16_t p1) { return min(p0, p1); }
+// NATIVE_HALF: define noundef <2 x i16> @
+// NATIVE_HALF: call <2 x i16> @llvm.umin.v2i16
+uint16_t2 test_min_ushort2(uint16_t2 p0, uint16_t2 p1) { return min(p0, p1); }
+// NATIVE_HALF: define noundef <3 x i16> @
+// NATIVE_HALF: call <3 x i16> @llvm.umin.v3i16
+uint16_t3 test_min_ushort3(uint16_t3 p0, uint16_t3 p1) { return min(p0, p1); }
+// NATIVE_HALF: define noundef <4 x i16> @
+// NATIVE_HALF: call <4 x i16> @llvm.umin.v4i16
+uint16_t4 test_min_ushort4(uint16_t4 p0, uint16_t4 p1) { return min(p0, p1); }
 #endif
 
 // CHECK: define noundef i32 @
 // CHECK: call i32 @llvm.smin.i32(
-int test_min_int ( int p0, int p1 ) {
-  return min ( p0, p1 );
-}
+int test_min_int(int p0, int p1) { return min(p0, p1); }
 // CHECK: define noundef <2 x i32> @
 // CHECK: call <2 x i32> @llvm.smin.v2i32
-int2 test_min_int2 ( int2 p0, int2 p1 ) {
-  return min ( p0, p1 );
-}
+int2 test_min_int2(int2 p0, int2 p1) { return min(p0, p1); }
 // CHECK: define noundef <3 x i32> @
 // CHECK: call <3 x i32> @llvm.smin.v3i32
-int3 test_min_int3 ( int3 p0, int3 p1 ) {
-  return min ( p0, p1 );
-}
+int3 test_min_int3(int3 p0, int3 p1) { return min(p0, p1); }
 // CHECK: define noundef <4 x i32> @
 // CHECK: call <4 x i32> @llvm.smin.v4i32
-int4 test_min_int4 ( int4 p0, int4 p1) {
-  return min ( p0, p1 );
-}
+int4 test_min_int4(int4 p0, int4 p1) { return min(p0, p1); }
 
 // CHECK: define noundef i32 @
 // CHECK: call i32 @llvm.umin.i32(
-int test_min_uint ( uint p0, uint p1 ) {
-  return min ( p0, p1 );
-}
+int test_min_uint(uint p0, uint p1) { return min(p0, p1); }
 // CHECK: define noundef <2 x i32> @
 // CHECK: call <2 x i32> @llvm.umin.v2i32
-uint2 test_min_uint2 ( uint2 p0, uint2 p1 ) {
-  return min ( p0, p1 );
-}
+uint2 test_min_uint2(uint2 p0, uint2 p1) { return min(p0, p1); }
 // CHECK: define noundef <3 x i32> @
 // CHECK: call <3 x i32> @llvm.umin.v3i32
-uint3 test_min_uint3 ( uint3 p0, uint3 p1 ) {
-  return min ( p0, p1 );
-}
+uint3 test_min_uint3(uint3 p0, uint3 p1) { return min(p0, p1); }
 // CHECK: define noundef <4 x i32> @
 // CHECK: call <4 x i32> @llvm.umin.v4i32
-uint4 test_min_uint4 ( uint4 p0, uint4 p1) {
-  return min ( p0, p1 );
-}
+uint4 test_min_uint4(uint4 p0, uint4 p1) { return min(p0, p1); }
 
 // CHECK: define noundef i64 @
 // CHECK: call i64 @llvm.smin.i64(
-int64_t test_min_long ( int64_t p0, int64_t p1 ) {
-  return min ( p0, p1 );
-}
+int64_t test_min_long(int64_t p0, int64_t p1) { return min(p0, p1); }
 // CHECK: define noundef <2 x i64> @
 // CHECK: call <2 x i64> @llvm.smin.v2i64
-int64_t2 test_min_long2 ( int64_t2 p0, int64_t2 p1 ) {
-  return min ( p0, p1 );
-}
+int64_t2 test_min_long2(int64_t2 p0, int64_t2 p1) { return min(p0, p1); }
 // CHECK: define noundef <3 x i64> @
 // CHECK: call <3 x i64> @llvm.smin.v3i64
-int64_t3 test_min_long3 ( int64_t3 p0, int64_t3 p1 ) {
-  return min ( p0, p1 );
-}
+int64_t3 test_min_long3(int64_t3 p0, int64_t3 p1) { return min(p0, p1); }
 // CHECK: define noundef <4 x i64> @
 // CHECK: call <4 x i64> @llvm.smin.v4i64
-int64_t4 test_min_long4 ( int64_t4 p0, int64_t4 p1) {
-  return min ( p0, p1 );
-}
+int64_t4 test_min_long4(int64_t4 p0, int64_t4 p1) { return min(p0, p1); }
 
 // CHECK: define noundef i64 @
 // CHECK: call i64 @llvm.umin.i64(
-uint64_t test_min_long ( uint64_t p0, uint64_t p1 ) {
-  return min ( p0, p1 );
-}
+uint64_t test_min_long(uint64_t p0, uint64_t p1) { return min(p0, p1); }
 // CHECK: define noundef <2 x i64> @
 // CHECK: call <2 x i64> @llvm.umin.v2i64
-uint64_t2 test_min_long2 ( uint64_t2 p0, uint64_t2 p1 ) {
-  return min ( p0, p1 );
-}
+uint64_t2 test_min_long2(uint64_t2 p0, uint64_t2 p1) { return min(p0, p1); }
 // CHECK: define noundef <3 x i64> @
 // CHECK: call <3 x i64> @llvm.umin.v3i64
-uint64_t3 test_min_long3 ( uint64_t3 p0, uint64_t3 p1 ) {
-  return min ( p0, p1 );
-}
+uint64_t3 test_min_long3(uint64_t3 p0, uint64_t3 p1) { return min(p0, p1); }
 // CHECK: define noundef <4 x i64> @
 // CHECK: call <4 x i64> @llvm.umin.v4i64
-uint64_t4 test_min_long4 ( uint64_t4 p0, uint64_t4 p1) {
-  return min ( p0, p1 );
-}
-
+uint64_t4 test_min_long4(uint64_t4 p0, uint64_t4 p1) { return min(p0, p1); }
 
-// CHECK: define noundef half @
-// CHECK: call half @llvm.minnum.f16(
-// NO_HALF: define noundef float @"?test_min_half@@YA$halff@$halff@0@Z"(
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.minnum.f16(
+// NO_HALF: define noundef float @"?test_min_half
 // NO_HALF: call float @llvm.minnum.f32(
-half test_min_half ( half p0, half p1 ) {
-  return min ( p0, p1 );
-}
-// CHECK: define noundef <2 x half> @
-// CHECK: call <2 x half> @llvm.minnum.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_min_float2@@YAT?$__vector@M$01@__clang@@T12@0@Z"(
+half test_min_half(half p0, half p1) { return min(p0, p1); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.minnum.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_min_half2
 // NO_HALF: call <2 x float> @llvm.minnum.v2f32(
-half2 test_min_half2 ( half2 p0, half2 p1 ) {
-  return min ( p0, p1 );
-}
-// CHECK: define noundef <3 x half> @
-// CHECK: call <3 x half> @llvm.minnum.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_min_float3@@YAT?$__vector@M$02@__clang@@T12@0@Z"(
+half2 test_min_half2(half2 p0, half2 p1) { return min(p0, p1); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.minnum.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_min_half3
 // NO_HALF: call <3 x float> @llvm.minnum.v3f32(
-half3 test_min_half3 ( half3 p0, half3 p1 ) {
-  return min ( p0, p1 );
-}
-// CHECK: define noundef <4 x half> @
-// CHECK: call <4 x half> @llvm.minnum.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_min_float4@@YAT?$__vector@M$03@__clang@@T12@0@Z"(
+half3 test_min_half3(half3 p0, half3 p1) { return min(p0, p1); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.minnum.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_min_half4
 // NO_HALF: call <4 x float> @llvm.minnum.v4f32(
-half4 test_min_half4 ( half4 p0, half4 p1 ) {
-  return min ( p0, p1 );
-}
+half4 test_min_half4(half4 p0, half4 p1) { return min(p0, p1); }
 
 // CHECK: define noundef float @
 // CHECK: call float @llvm.minnum.f32(
-float test_min_float ( float p0, float p1 ) {
-  return min ( p0, p1 );
-}
+float test_min_float(float p0, float p1) { return min(p0, p1); }
 // CHECK: define noundef <2 x float> @
 // CHECK: call <2 x float> @llvm.minnum.v2f32
-float2 test_min_float2 ( float2 p0, float2 p1 ) {
-  return min ( p0, p1 );
-}
+float2 test_min_float2(float2 p0, float2 p1) { return min(p0, p1); }
 // CHECK: define noundef <3 x float> @
 // CHECK: call <3 x float> @llvm.minnum.v3f32
-float3 test_min_float3 ( float3 p0, float3 p1 ) {
-  return min ( p0, p1 );
-}
+float3 test_min_float3(float3 p0, float3 p1) { return min(p0, p1); }
 // CHECK: define noundef <4 x float> @
 // CHECK: call <4 x float> @llvm.minnum.v4f32
-float4 test_min_float4 ( float4 p0, float4 p1) {
-  return min ( p0, p1 );
-}
+float4 test_min_float4(float4 p0, float4 p1) { return min(p0, p1); }
 
 // CHECK: define noundef double @
 // CHECK: call double @llvm.minnum.f64(
-double test_min_double ( double p0, double p1 ) {
-  return min ( p0, p1 );
-}
+double test_min_double(double p0, double p1) { return min(p0, p1); }
 // CHECK: define noundef <2 x double> @
 // CHECK: call <2 x double> @llvm.minnum.v2f64
-double2 test_min_double2 ( double2 p0, double2 p1 ) {
-  return min ( p0, p1 );
-}
+double2 test_min_double2(double2 p0, double2 p1) { return min(p0, p1); }
 // CHECK: define noundef <3 x double> @
 // CHECK: call <3 x double> @llvm.minnum.v3f64
-double3 test_min_double3 ( double3 p0, double3 p1 ) {
-  return min ( p0, p1 );
-}
+double3 test_min_double3(double3 p0, double3 p1) { return min(p0, p1); }
 // CHECK: define noundef <4 x double> @
 // CHECK: call <4 x double> @llvm.minnum.v4f64
-double4 test_min_double4 ( double4 p0, double4 p1) {
-  return min ( p0, p1 );
-}
+double4 test_min_double4(double4 p0, double4 p1) { return min(p0, p1); }
diff --git a/clang/test/CodeGenHLSL/builtins/pow.hlsl b/clang/test/CodeGenHLSL/builtins/pow.hlsl
index 86bfe98058a6eb..e996ca2f336410 100644
--- a/clang/test/CodeGenHLSL/builtins/pow.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/pow.hlsl
@@ -1,89 +1,54 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefix=NO_HALF
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// CHECK: define noundef half @
-// CHECK: call half @llvm.pow.f16(
-// NO_HALF: define noundef float @"?test_pow_half@@YA$halff@$halff@0@Z"(
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.pow.f16(
+// NO_HALF: define noundef float @"?test_pow_half
 // NO_HALF: call float @llvm.pow.f32(
-half test_pow_half(half p0, half p1)
-{
-    return pow(p0, p1);
-}
-// CHECK: define noundef <2 x half> @"?test_pow_half2@@YAT?$__vector@$f16@$01@__clang@@T12@0@Z"(
-// CHECK: call <2 x half> @llvm.pow.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_pow_float2@@YAT?$__vector@M$01@__clang@@T12@0@Z"(
+half test_pow_half(half p0, half p1) { return pow(p0, p1); }
+// NATIVE_HALF: define noundef <2 x half> @"?test_pow_half2
+// NATIVE_HALF: call <2 x half> @llvm.pow.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_pow_half2
 // NO_HALF: call <2 x float> @llvm.pow.v2f32(
-half2 test_pow_half2(half2 p0, half2 p1)
-{
-    return pow(p0, p1);
-}
-// CHECK: define noundef <3 x half> @"?test_pow_half3@@YAT?$__vector@$f16@$02@__clang@@T12@0@Z"(
-// CHECK: call <3 x half> @llvm.pow.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_pow_float3@@YAT?$__vector@M$02@__clang@@T12@0@Z"(
+half2 test_pow_half2(half2 p0, half2 p1) { return pow(p0, p1); }
+// NATIVE_HALF: define noundef <3 x half> @"?test_pow_half3
+// NATIVE_HALF: call <3 x half> @llvm.pow.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_pow_half3
 // NO_HALF: call <3 x float> @llvm.pow.v3f32(
-half3 test_pow_half3(half3 p0, half3 p1)
-{
-    return pow(p0, p1);
-}
-// CHECK: define noundef <4 x half> @"?test_pow_half4@@YAT?$__vector@$f16@$03@__clang@@T12@0@Z"(
-// CHECK: call <4 x half> @llvm.pow.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_pow_float4@@YAT?$__vector@M$03@__clang@@T12@0@Z"(
+half3 test_pow_half3(half3 p0, half3 p1) { return pow(p0, p1); }
+// NATIVE_HALF: define noundef <4 x half> @"?test_pow_half4
+// NATIVE_HALF: call <4 x half> @llvm.pow.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_pow_half4
 // NO_HALF: call <4 x float> @llvm.pow.v4f32(
-half4 test_pow_half4(half4 p0, half4 p1)
-{
-    return pow(p0, p1);
-}
+half4 test_pow_half4(half4 p0, half4 p1) { return pow(p0, p1); }
 
-// CHECK: define noundef float @"?test_pow_float@@YAMMM@Z"(
+// CHECK: define noundef float @"?test_pow_float
 // CHECK: call float @llvm.pow.f32(
-float test_pow_float(float p0, float p1)
-{
-    return pow(p0, p1);
-}
-// CHECK: define noundef <2 x float> @"?test_pow_float2@@YAT?$__vector@M$01@__clang@@T12@0@Z"(
+float test_pow_float(float p0, float p1) { return pow(p0, p1); }
+// CHECK: define noundef <2 x float> @"?test_pow_float2
 // CHECK: call <2 x float> @llvm.pow.v2f32
-float2 test_pow_float2(float2 p0, float2 p1)
-{
-    return pow(p0, p1);
-}
-// CHECK: define noundef <3 x float> @"?test_pow_float3@@YAT?$__vector@M$02@__clang@@T12@0@Z"(
+float2 test_pow_float2(float2 p0, float2 p1) { return pow(p0, p1); }
+// CHECK: define noundef <3 x float> @"?test_pow_float3
 // CHECK: call <3 x float> @llvm.pow.v3f32
-float3 test_pow_float3(float3 p0, float3 p1)
-{
-    return pow(p0, p1);
-}
-// CHECK: define noundef <4 x float> @"?test_pow_float4@@YAT?$__vector@M$03@__clang@@T12@0@Z"(
+float3 test_pow_float3(float3 p0, float3 p1) { return pow(p0, p1); }
+// CHECK: define noundef <4 x float> @"?test_pow_float4
 // CHECK: call <4 x float> @llvm.pow.v4f32
-float4 test_pow_float4(float4 p0, float4 p1)
-{
-    return pow(p0, p1);
-}
+float4 test_pow_float4(float4 p0, float4 p1) { return pow(p0, p1); }
 
 // CHECK: define noundef double @"?test_pow_double@@YANNN@Z"(
 // CHECK: call double @llvm.pow.f64(
-double test_pow_double(double p0, double p1)
-{
-    return pow(p0, p1);
-}
+double test_pow_double(double p0, double p1) { return pow(p0, p1); }
 // CHECK: define noundef <2 x double> @"?test_pow_double2@@YAT?$__vector@N$01@__clang@@T12@0@Z"(
 // CHECK: call <2 x double> @llvm.pow.v2f64
-double2 test_pow_double2(double2 p0, double2 p1)
-{
-    return pow(p0, p1);
-}
+double2 test_pow_double2(double2 p0, double2 p1) { return pow(p0, p1); }
 // CHECK: define noundef <3 x double> @"?test_pow_double3@@YAT?$__vector@N$02@__clang@@T12@0@Z"(
 // CHECK: call <3 x double> @llvm.pow.v3f64
-double3 test_pow_double3(double3 p0, double3 p1)
-{
-    return pow(p0, p1);
-}
+double3 test_pow_double3(double3 p0, double3 p1) { return pow(p0, p1); }
 // CHECK: define noundef <4 x double> @"?test_pow_double4@@YAT?$__vector@N$03@__clang@@T12@0@Z"(
 // CHECK: call <4 x double> @llvm.pow.v4f64
-double4 test_pow_double4(double4 p0, double4 p1)
-{
-    return pow(p0, p1);
-}
+double4 test_pow_double4(double4 p0, double4 p1) { return pow(p0, p1); }
diff --git a/clang/test/CodeGenHLSL/builtins/sin.hlsl b/clang/test/CodeGenHLSL/builtins/sin.hlsl
index 2445e6063a7052..ffb52214913886 100644
--- a/clang/test/CodeGenHLSL/builtins/sin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sin.hlsl
@@ -1,56 +1,41 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
+// RUN:   -emit-llvm -disable-llvm-passes  -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefix=NO_HALF
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// CHECK: define noundef half @
-// CHECK: call half @llvm.sin.f16(
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.sin.f16(
 // NO_HALF: define noundef float @"?test_sin_half@@YA$halff@$halff@@Z"(
 // NO_HALF: call float @llvm.sin.f32(
-half test_sin_half ( half p0 ) {
-  return sin ( p0 );
-}
-// CHECK: define noundef <2 x half> @
-// CHECK: call <2 x half> @llvm.sin.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_sin_float2@@YAT?$__vector@M$01@__clang@@T12@@Z"(
+half test_sin_half(half p0) { return sin(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.sin.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_sin_half2
 // NO_HALF: call <2 x float> @llvm.sin.v2f32(
-half2 test_sin_half2 ( half2 p0 ) {
-  return sin ( p0 );
-}
-// CHECK: define noundef <3 x half> @
-// CHECK: call <3 x half> @llvm.sin.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_sin_float3@@YAT?$__vector@M$02@__clang@@T12@@Z"(
+half2 test_sin_half2(half2 p0) { return sin(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.sin.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_sin_half3
 // NO_HALF: call <3 x float> @llvm.sin.v3f32(
-half3 test_sin_half3 ( half3 p0 ) {
-  return sin ( p0 );
-}
-// CHECK: define noundef <4 x half> @
-// CHECK: call <4 x half> @llvm.sin.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_sin_float4@@YAT?$__vector@M$03@__clang@@T12@@Z"(
+half3 test_sin_half3(half3 p0) { return sin(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.sin.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_sin_half4
 // NO_HALF: call <4 x float> @llvm.sin.v4f32(
-half4 test_sin_half4 ( half4 p0 ) {
-  return sin ( p0 );
-}
+half4 test_sin_half4(half4 p0) { return sin(p0); }
 
 // CHECK: define noundef float @
 // CHECK: call float @llvm.sin.f32(
-float test_sin_float ( float p0 ) {
-  return sin ( p0 );
-}
+float test_sin_float(float p0) { return sin(p0); }
 // CHECK: define noundef <2 x float> @
 // CHECK: call <2 x float> @llvm.sin.v2f32
-float2 test_sin_float2 ( float2 p0 ) {
-  return sin ( p0 );
-}
+float2 test_sin_float2(float2 p0) { return sin(p0); }
 // CHECK: define noundef <3 x float> @
 // CHECK: call <3 x float> @llvm.sin.v3f32
-float3 test_sin_float3 ( float3 p0 ) {
-  return sin ( p0 );
-}
+float3 test_sin_float3(float3 p0) { return sin(p0); }
 // CHECK: define noundef <4 x float> @
 // CHECK: call <4 x float> @llvm.sin.v4f32
-float4 test_sin_float4 ( float4 p0 ) {
-  return sin ( p0 );
-}
+float4 test_sin_float4(float4 p0) { return sin(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/trunc.hlsl b/clang/test/CodeGenHLSL/builtins/trunc.hlsl
index 4ae3cd20257ec0..6078aae5f873fe 100644
--- a/clang/test/CodeGenHLSL/builtins/trunc.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/trunc.hlsl
@@ -1,56 +1,47 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefix=NO_HALF
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// CHECK: define noundef half @
-// CHECK: call half @llvm.trunc.f16(
-// NO_HALF: define noundef float @"?test_trunc_half@@YA$halff@$halff@@Z"(
+// NATIVE_HALF: define noundef half @"?test_trunc_half
+// NATIVE_HALF: call half @llvm.trunc.f16(
+// NO_HALF: define noundef float @"?test_trunc_half
 // NO_HALF: call float @llvm.trunc.f32(
-half test_trunc_half ( half p0 ) {
-  return trunc ( p0 );
-}
-// CHECK: define noundef <2 x half> @
-// CHECK: call <2 x half> @llvm.trunc.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_trunc_float2@@YAT?$__vector@M$01@__clang@@T12@@Z"(
+half test_trunc_half(half p0) { return trunc(p0); }
+
+// NATIVE_HALF: define noundef <2 x half> @"?test_trunc_half2
+// NATIVE_HALF: call <2 x half> @llvm.trunc.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_trunc_half2
 // NO_HALF: call <2 x float> @llvm.trunc.v2f32(
-half2 test_trunc_half2 ( half2 p0 ) {
-  return trunc ( p0 );
-}
-// CHECK: define noundef <3 x half> @
-// CHECK: call <3 x half> @llvm.trunc.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_trunc_float3@@YAT?$__vector@M$02@__clang@@T12@@Z"(
+half2 test_trunc_half2(half2 p0) { return trunc(p0); }
+
+// NATIVE_HALF: define noundef <3 x half> @"?test_trunc_half3
+// NATIVE_HALF: call <3 x half> @llvm.trunc.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_trunc_half3
 // NO_HALF: call <3 x float> @llvm.trunc.v3f32(
-half3 test_trunc_half3 ( half3 p0 ) {
-  return trunc ( p0 );
-}
-// CHECK: define noundef <4 x half> @
-// CHECK: call <4 x half> @llvm.trunc.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_trunc_float4@@YAT?$__vector@M$03@__clang@@T12@@Z"(
+half3 test_trunc_half3(half3 p0) { return trunc(p0); }
+
+// NATIVE_HALF: define noundef <4 x half> @"?test_trunc_half4
+// NATIVE_HALF: call <4 x half> @llvm.trunc.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_trunc_half4
 // NO_HALF: call <4 x float> @llvm.trunc.v4f32(
-half4 test_trunc_half4 ( half4 p0 ) {
-  return trunc ( p0 );
-}
+half4 test_trunc_half4(half4 p0) { return trunc(p0); }
 
-// CHECK: define noundef float @
+// CHECK: define noundef float @"?test_trunc_float
 // CHECK: call float @llvm.trunc.f32(
-float test_trunc_float ( float p0 ) {
-  return trunc ( p0 );
-}
-// CHECK: define noundef <2 x float> @
+float test_trunc_float(float p0) { return trunc(p0); }
+
+// CHECK: define noundef <2 x float> @"?test_trunc_float2
 // CHECK: call <2 x float> @llvm.trunc.v2f32
-float2 test_trunc_float2 ( float2 p0 ) {
-  return trunc ( p0 );
-}
-// CHECK: define noundef <3 x float> @
+float2 test_trunc_float2(float2 p0) { return trunc(p0); }
+
+// CHECK: define noundef <3 x float> @"?test_trunc_float3
 // CHECK: call <3 x float> @llvm.trunc.v3f32
-float3 test_trunc_float3 ( float3 p0 ) {
-  return trunc ( p0 );
-}
-// CHECK: define noundef <4 x float> @
+float3 test_trunc_float3(float3 p0) { return trunc(p0); }
+
+// CHECK: define noundef <4 x float> @"?test_trunc_float4
 // CHECK: call <4 x float> @llvm.trunc.v4f32
-float4 test_trunc_float4 ( float4 p0 ) {
-  return trunc ( p0 );
-}
+float4 test_trunc_float4(float4 p0) { return trunc(p0); }
diff --git a/clang/test/SemaHLSL/VectorOverloadResolution.hlsl b/clang/test/SemaHLSL/VectorOverloadResolution.hlsl
index 81fedc2de31570..2ea7d14e80eebf 100644
--- a/clang/test/SemaHLSL/VectorOverloadResolution.hlsl
+++ b/clang/test/SemaHLSL/VectorOverloadResolution.hlsl
@@ -40,7 +40,7 @@ void Fn3( int64_t2 p0);
 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'half2':'half __attribute__((ext_vector_type(2)))' <LValueToRValue>
 // CHECK-NEXT: DeclRefExpr {{.*}} 'half2':'half __attribute__((ext_vector_type(2)))' lvalue ParmVar {{.*}} 'p0' 'half2':'half __attribute__((ext_vector_type(2)))'
 // CHECKIR-LABEL: Call3
-// CHECKIR: %conv = fptosi <2 x half> {{.*}} to <2 x i64>
+// CHECKIR: {{.*}} = fptosi <2 x half> {{.*}} to <2 x i64>
 void Call3(half2 p0) {
   Fn3(p0);
 }

From de8e2b7b8649ffa516d357a646eacbce8e69e39e Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Thu, 29 Feb 2024 18:53:02 +0000
Subject: [PATCH 52/55] [test][SROA] Regenerate vector-promotion.ll

---
 llvm/test/Transforms/SROA/vector-promotion.ll | 364 +++++++++---------
 1 file changed, 182 insertions(+), 182 deletions(-)

diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll
index e48dd5bb392082..1691f7733acea5 100644
--- a/llvm/test/Transforms/SROA/vector-promotion.ll
+++ b/llvm/test/Transforms/SROA/vector-promotion.ll
@@ -22,21 +22,21 @@ define i32 @test1(<4 x i32> %x, <4 x i32> %y) {
 ;
 ; DEBUG-LABEL: @test1(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META9:![0-9]+]], metadata !DIExpression()), !dbg [[DBG21:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG22:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META12:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META9:![0-9]+]], metadata !DIExpression()), !dbg [[DBG21:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG22:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META12:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2, !dbg [[DBG24:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_0_8_VEC_EXTRACT]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG24]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META15:![0-9]+]], metadata !DIExpression()), !dbg [[DBG25:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_0_8_VEC_EXTRACT]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG24]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META15:![0-9]+]], metadata !DIExpression()), !dbg [[DBG25:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_2_28_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 3, !dbg [[DBG26:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_2_28_VEC_EXTRACT]], metadata [[META16:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META17:![0-9]+]], metadata !DIExpression()), !dbg [[DBG27:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_2_28_VEC_EXTRACT]], metadata [[META16:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META17:![0-9]+]], metadata !DIExpression()), !dbg [[DBG27:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_2_16_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[Y]], i32 0, !dbg [[DBG28:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_2_16_VEC_EXTRACT]], metadata [[META18:![0-9]+]], metadata !DIExpression()), !dbg [[DBG28]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_2_16_VEC_EXTRACT]], metadata [[META18:![0-9]+]], metadata !DIExpression()), !dbg [[DBG28]]
 ; DEBUG-NEXT:    [[TMP4:%.*]] = add i32 [[A_SROA_0_8_VEC_EXTRACT]], [[A_SROA_2_28_VEC_EXTRACT]], !dbg [[DBG29:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META19:![0-9]+]], metadata !DIExpression()), !dbg [[DBG29]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META19:![0-9]+]], metadata !DIExpression()), !dbg [[DBG29]]
 ; DEBUG-NEXT:    [[TMP5:%.*]] = add i32 [[A_SROA_2_16_VEC_EXTRACT]], [[TMP4]], !dbg [[DBG30:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP5]], metadata [[META20:![0-9]+]], metadata !DIExpression()), !dbg [[DBG30]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP5]], metadata [[META20:![0-9]+]], metadata !DIExpression()), !dbg [[DBG30]]
 ; DEBUG-NEXT:    ret i32 [[TMP5]], !dbg [[DBG31:![0-9]+]]
 ;
 entry:
@@ -71,23 +71,23 @@ define i32 @test2(<4 x i32> %x, <4 x i32> %y) {
 ;
 ; DEBUG-LABEL: @test2(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META34:![0-9]+]], metadata !DIExpression()), !dbg [[DBG45:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META35:![0-9]+]], metadata !DIExpression()), !dbg [[DBG46:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META36:![0-9]+]], metadata !DIExpression()), !dbg [[DBG47:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META34:![0-9]+]], metadata !DIExpression()), !dbg [[DBG45:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META35:![0-9]+]], metadata !DIExpression()), !dbg [[DBG46:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META36:![0-9]+]], metadata !DIExpression()), !dbg [[DBG47:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2, !dbg [[DBG48:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_0_8_VEC_EXTRACT]], metadata [[META37:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META38:![0-9]+]], metadata !DIExpression()), !dbg [[DBG49:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_0_8_VEC_EXTRACT]], metadata [[META37:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META38:![0-9]+]], metadata !DIExpression()), !dbg [[DBG49:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_2_28_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 3, !dbg [[DBG50:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_2_28_VEC_EXTRACT]], metadata [[META39:![0-9]+]], metadata !DIExpression()), !dbg [[DBG50]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META40:![0-9]+]], metadata !DIExpression()), !dbg [[DBG51:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_2_28_VEC_EXTRACT]], metadata [[META39:![0-9]+]], metadata !DIExpression()), !dbg [[DBG50]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META40:![0-9]+]], metadata !DIExpression()), !dbg [[DBG51:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_2_16_VEC_EXTRACT:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>, !dbg [[DBG52:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <2 x i32> [[A_SROA_2_16_VEC_EXTRACT]], metadata [[META41:![0-9]+]], metadata !DIExpression()), !dbg [[DBG52]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <2 x i32> [[A_SROA_2_16_VEC_EXTRACT]], metadata [[META41:![0-9]+]], metadata !DIExpression()), !dbg [[DBG52]]
 ; DEBUG-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[A_SROA_2_16_VEC_EXTRACT]], i32 0, !dbg [[DBG53:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP3]], metadata [[META42:![0-9]+]], metadata !DIExpression()), !dbg [[DBG53]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP3]], metadata [[META42:![0-9]+]], metadata !DIExpression()), !dbg [[DBG53]]
 ; DEBUG-NEXT:    [[TMP4:%.*]] = add i32 [[A_SROA_0_8_VEC_EXTRACT]], [[A_SROA_2_28_VEC_EXTRACT]], !dbg [[DBG54:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META43:![0-9]+]], metadata !DIExpression()), !dbg [[DBG54]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META43:![0-9]+]], metadata !DIExpression()), !dbg [[DBG54]]
 ; DEBUG-NEXT:    [[TMP5:%.*]] = add i32 [[TMP3]], [[TMP4]], !dbg [[DBG55:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP5]], metadata [[META44:![0-9]+]], metadata !DIExpression()), !dbg [[DBG55]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP5]], metadata [[META44:![0-9]+]], metadata !DIExpression()), !dbg [[DBG55]]
 ; DEBUG-NEXT:    ret i32 [[TMP5]], !dbg [[DBG56:![0-9]+]]
 ;
 entry:
@@ -123,22 +123,22 @@ define i32 @test3(<4 x i32> %x, <4 x i32> %y) {
 ;
 ; DEBUG-LABEL: @test3(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META59:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META60:![0-9]+]], metadata !DIExpression()), !dbg [[DBG70:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META61:![0-9]+]], metadata !DIExpression()), !dbg [[DBG71:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META59:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META60:![0-9]+]], metadata !DIExpression()), !dbg [[DBG70:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META61:![0-9]+]], metadata !DIExpression()), !dbg [[DBG71:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X:%.*]], i32 -1, i32 2, !dbg [[DBG72:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[A_SROA_0_8_VEC_INSERT]], i32 2, !dbg [[DBG73:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_0_8_VEC_EXTRACT]], metadata [[META62:![0-9]+]], metadata !DIExpression()), !dbg [[DBG73]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META63:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_0_8_VEC_EXTRACT]], metadata [[META62:![0-9]+]], metadata !DIExpression()), !dbg [[DBG73]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META63:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_3_28_VEC_EXTRACT:%.*]] = extractelement <4 x i32> zeroinitializer, i32 3, !dbg [[DBG75:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_3_28_VEC_EXTRACT]], metadata [[META64:![0-9]+]], metadata !DIExpression()), !dbg [[DBG75]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META65:![0-9]+]], metadata !DIExpression()), !dbg [[DBG76:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_3_28_VEC_EXTRACT]], metadata [[META64:![0-9]+]], metadata !DIExpression()), !dbg [[DBG75]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META65:![0-9]+]], metadata !DIExpression()), !dbg [[DBG76:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_3_16_VEC_EXTRACT:%.*]] = extractelement <4 x i32> zeroinitializer, i32 0, !dbg [[DBG77:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_3_16_VEC_EXTRACT]], metadata [[META66:![0-9]+]], metadata !DIExpression()), !dbg [[DBG77]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_3_16_VEC_EXTRACT]], metadata [[META66:![0-9]+]], metadata !DIExpression()), !dbg [[DBG77]]
 ; DEBUG-NEXT:    [[TMP4:%.*]] = add i32 [[A_SROA_0_8_VEC_EXTRACT]], [[A_SROA_3_28_VEC_EXTRACT]], !dbg [[DBG78:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META67:![0-9]+]], metadata !DIExpression()), !dbg [[DBG78]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META67:![0-9]+]], metadata !DIExpression()), !dbg [[DBG78]]
 ; DEBUG-NEXT:    [[TMP5:%.*]] = add i32 [[A_SROA_3_16_VEC_EXTRACT]], [[TMP4]], !dbg [[DBG79:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP5]], metadata [[META68:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP5]], metadata [[META68:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]]
 ; DEBUG-NEXT:    ret i32 [[TMP5]], !dbg [[DBG80:![0-9]+]]
 ;
 entry:
@@ -179,26 +179,26 @@ define i32 @test4(<4 x i32> %x, <4 x i32> %y, ptr %z) {
 ;
 ; DEBUG-LABEL: @test4(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG94:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG95:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG94:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG95:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_3_16_COPYLOAD:%.*]] = load <4 x i32>, ptr [[Z:%.*]], align 1, !dbg [[DBG96:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG97:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG97:![0-9]+]]
 ; DEBUG-NEXT:    [[Z_TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr [[Z]], i64 0, i64 2, !dbg [[DBG98:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr [[Z_TMP1]], metadata [[META86:![0-9]+]], metadata !DIExpression()), !dbg [[DBG98]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr [[Z_TMP1]], metadata [[META86:![0-9]+]], metadata !DIExpression()), !dbg [[DBG98]]
 ; DEBUG-NEXT:    [[A_SROA_0_8_COPYLOAD:%.*]] = load i32, ptr [[Z_TMP1]], align 1, !dbg [[DBG99:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X:%.*]], i32 [[A_SROA_0_8_COPYLOAD]], i32 2, !dbg [[DBG99]]
 ; DEBUG-NEXT:    [[A_SROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[A_SROA_0_8_VEC_INSERT]], i32 2, !dbg [[DBG100:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_0_8_VEC_EXTRACT]], metadata [[META87:![0-9]+]], metadata !DIExpression()), !dbg [[DBG100]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META88:![0-9]+]], metadata !DIExpression()), !dbg [[DBG101:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_0_8_VEC_EXTRACT]], metadata [[META87:![0-9]+]], metadata !DIExpression()), !dbg [[DBG100]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META88:![0-9]+]], metadata !DIExpression()), !dbg [[DBG101:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_3_28_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[A_SROA_3_16_COPYLOAD]], i32 3, !dbg [[DBG102:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_3_28_VEC_EXTRACT]], metadata [[META89:![0-9]+]], metadata !DIExpression()), !dbg [[DBG102]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META90:![0-9]+]], metadata !DIExpression()), !dbg [[DBG103:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_3_28_VEC_EXTRACT]], metadata [[META89:![0-9]+]], metadata !DIExpression()), !dbg [[DBG102]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META90:![0-9]+]], metadata !DIExpression()), !dbg [[DBG103:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_3_16_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[A_SROA_3_16_COPYLOAD]], i32 0, !dbg [[DBG104:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_3_16_VEC_EXTRACT]], metadata [[META91:![0-9]+]], metadata !DIExpression()), !dbg [[DBG104]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_3_16_VEC_EXTRACT]], metadata [[META91:![0-9]+]], metadata !DIExpression()), !dbg [[DBG104]]
 ; DEBUG-NEXT:    [[TMP4:%.*]] = add i32 [[A_SROA_0_8_VEC_EXTRACT]], [[A_SROA_3_28_VEC_EXTRACT]], !dbg [[DBG105:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META92:![0-9]+]], metadata !DIExpression()), !dbg [[DBG105]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META92:![0-9]+]], metadata !DIExpression()), !dbg [[DBG105]]
 ; DEBUG-NEXT:    [[TMP5:%.*]] = add i32 [[A_SROA_3_16_VEC_EXTRACT]], [[TMP4]], !dbg [[DBG106:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP5]], metadata [[META93:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP5]], metadata [[META93:![0-9]+]], metadata !DIExpression()), !dbg [[DBG106]]
 ; DEBUG-NEXT:    ret i32 [[TMP5]], !dbg [[DBG107:![0-9]+]]
 ;
 entry:
@@ -243,26 +243,26 @@ define i32 @test4_as1(<4 x i32> %x, <4 x i32> %y, ptr addrspace(1) %z) {
 ;
 ; DEBUG-LABEL: @test4_as1(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META110:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META111:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META110:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META111:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_3_16_COPYLOAD:%.*]] = load <4 x i32>, ptr addrspace(1) [[Z:%.*]], align 1, !dbg [[DBG123:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META112:![0-9]+]], metadata !DIExpression()), !dbg [[DBG124:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META112:![0-9]+]], metadata !DIExpression()), !dbg [[DBG124:![0-9]+]]
 ; DEBUG-NEXT:    [[Z_TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[Z]], i16 0, i16 2, !dbg [[DBG125:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr addrspace(1) [[Z_TMP1]], metadata [[META113:![0-9]+]], metadata !DIExpression()), !dbg [[DBG125]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr addrspace(1) [[Z_TMP1]], metadata [[META113:![0-9]+]], metadata !DIExpression()), !dbg [[DBG125]]
 ; DEBUG-NEXT:    [[A_SROA_0_8_COPYLOAD:%.*]] = load i32, ptr addrspace(1) [[Z_TMP1]], align 1, !dbg [[DBG126:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X:%.*]], i32 [[A_SROA_0_8_COPYLOAD]], i32 2, !dbg [[DBG126]]
 ; DEBUG-NEXT:    [[A_SROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[A_SROA_0_8_VEC_INSERT]], i32 2, !dbg [[DBG127:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_0_8_VEC_EXTRACT]], metadata [[META114:![0-9]+]], metadata !DIExpression()), !dbg [[DBG127]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META115:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_0_8_VEC_EXTRACT]], metadata [[META114:![0-9]+]], metadata !DIExpression()), !dbg [[DBG127]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META115:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_3_28_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[A_SROA_3_16_COPYLOAD]], i32 3, !dbg [[DBG129:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_3_28_VEC_EXTRACT]], metadata [[META116:![0-9]+]], metadata !DIExpression()), !dbg [[DBG129]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META117:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_3_28_VEC_EXTRACT]], metadata [[META116:![0-9]+]], metadata !DIExpression()), !dbg [[DBG129]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META117:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_3_16_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[A_SROA_3_16_COPYLOAD]], i32 0, !dbg [[DBG131:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_3_16_VEC_EXTRACT]], metadata [[META118:![0-9]+]], metadata !DIExpression()), !dbg [[DBG131]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_3_16_VEC_EXTRACT]], metadata [[META118:![0-9]+]], metadata !DIExpression()), !dbg [[DBG131]]
 ; DEBUG-NEXT:    [[TMP4:%.*]] = add i32 [[A_SROA_0_8_VEC_EXTRACT]], [[A_SROA_3_28_VEC_EXTRACT]], !dbg [[DBG132:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META119:![0-9]+]], metadata !DIExpression()), !dbg [[DBG132]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META119:![0-9]+]], metadata !DIExpression()), !dbg [[DBG132]]
 ; DEBUG-NEXT:    [[TMP5:%.*]] = add i32 [[A_SROA_3_16_VEC_EXTRACT]], [[TMP4]], !dbg [[DBG133:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP5]], metadata [[META120:![0-9]+]], metadata !DIExpression()), !dbg [[DBG133]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP5]], metadata [[META120:![0-9]+]], metadata !DIExpression()), !dbg [[DBG133]]
 ; DEBUG-NEXT:    ret i32 [[TMP5]], !dbg [[DBG134:![0-9]+]]
 ;
 entry:
@@ -305,25 +305,25 @@ define i32 @test5(<4 x i32> %x, <4 x i32> %y, ptr %z) {
 ;
 ; DEBUG-LABEL: @test5(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META137:![0-9]+]], metadata !DIExpression()), !dbg [[DBG148:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META138:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META139:![0-9]+]], metadata !DIExpression()), !dbg [[DBG150:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META137:![0-9]+]], metadata !DIExpression()), !dbg [[DBG148:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META138:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META139:![0-9]+]], metadata !DIExpression()), !dbg [[DBG150:![0-9]+]]
 ; DEBUG-NEXT:    [[Z_TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr [[Z:%.*]], i64 0, i64 2, !dbg [[DBG151:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr [[Z_TMP1]], metadata [[META140:![0-9]+]], metadata !DIExpression()), !dbg [[DBG151]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr [[Z_TMP1]], metadata [[META140:![0-9]+]], metadata !DIExpression()), !dbg [[DBG151]]
 ; DEBUG-NEXT:    [[A_SROA_0_8_VEC_EXTRACT3:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2, !dbg [[DBG152:![0-9]+]]
 ; DEBUG-NEXT:    store i32 [[A_SROA_0_8_VEC_EXTRACT3]], ptr [[Z_TMP1]], align 1, !dbg [[DBG152]]
 ; DEBUG-NEXT:    [[A_SROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[Y]], i32 2, !dbg [[DBG153:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_0_8_VEC_EXTRACT]], metadata [[META141:![0-9]+]], metadata !DIExpression()), !dbg [[DBG153]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META142:![0-9]+]], metadata !DIExpression()), !dbg [[DBG154:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_0_8_VEC_EXTRACT]], metadata [[META141:![0-9]+]], metadata !DIExpression()), !dbg [[DBG153]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META142:![0-9]+]], metadata !DIExpression()), !dbg [[DBG154:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_4_12_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[Y]], i32 3, !dbg [[DBG155:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_4_12_VEC_EXTRACT]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG155]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG156:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_4_12_VEC_EXTRACT]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG155]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG156:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_4_0_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[Y]], i32 0, !dbg [[DBG157:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_4_0_VEC_EXTRACT]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG157]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_4_0_VEC_EXTRACT]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG157]]
 ; DEBUG-NEXT:    [[TMP4:%.*]] = add i32 [[A_SROA_0_8_VEC_EXTRACT]], [[A_SROA_4_12_VEC_EXTRACT]], !dbg [[DBG158:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META146:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META146:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158]]
 ; DEBUG-NEXT:    [[TMP5:%.*]] = add i32 [[A_SROA_4_0_VEC_EXTRACT]], [[TMP4]], !dbg [[DBG159:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP5]], metadata [[META147:![0-9]+]], metadata !DIExpression()), !dbg [[DBG159]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP5]], metadata [[META147:![0-9]+]], metadata !DIExpression()), !dbg [[DBG159]]
 ; DEBUG-NEXT:    ret i32 [[TMP5]], !dbg [[DBG160:![0-9]+]]
 ;
 entry:
@@ -367,17 +367,17 @@ define i64 @test6(<4 x i64> %x, <4 x i64> %y, i64 %n) {
 ;
 ; DEBUG-LABEL: @test6(
 ; DEBUG-NEXT:    [[TMP:%.*]] = alloca { <4 x i64>, <4 x i64> }, align 32, !dbg [[DBG168:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr [[TMP]], metadata [[META163:![0-9]+]], metadata !DIExpression()), !dbg [[DBG168]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr [[TMP]], metadata [[META163:![0-9]+]], metadata !DIExpression()), !dbg [[DBG168]]
 ; DEBUG-NEXT:    [[P0:%.*]] = getelementptr inbounds { <4 x i64>, <4 x i64> }, ptr [[TMP]], i32 0, i32 0, !dbg [[DBG169:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr [[P0]], metadata [[META164:![0-9]+]], metadata !DIExpression()), !dbg [[DBG169]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr [[P0]], metadata [[META164:![0-9]+]], metadata !DIExpression()), !dbg [[DBG169]]
 ; DEBUG-NEXT:    store <4 x i64> [[X:%.*]], ptr [[P0]], align 32, !dbg [[DBG170:![0-9]+]]
 ; DEBUG-NEXT:    [[P1:%.*]] = getelementptr inbounds { <4 x i64>, <4 x i64> }, ptr [[TMP]], i32 0, i32 1, !dbg [[DBG171:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr [[P1]], metadata [[META165:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr [[P1]], metadata [[META165:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
 ; DEBUG-NEXT:    store <4 x i64> [[Y:%.*]], ptr [[P1]], align 32, !dbg [[DBG172:![0-9]+]]
 ; DEBUG-NEXT:    [[ADDR:%.*]] = getelementptr inbounds { <4 x i64>, <4 x i64> }, ptr [[TMP]], i32 0, i32 0, i64 [[N:%.*]], !dbg [[DBG173:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr [[ADDR]], metadata [[META166:![0-9]+]], metadata !DIExpression()), !dbg [[DBG173]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr [[ADDR]], metadata [[META166:![0-9]+]], metadata !DIExpression()), !dbg [[DBG173]]
 ; DEBUG-NEXT:    [[RES:%.*]] = load i64, ptr [[ADDR]], align 4, !dbg [[DBG174:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i64 [[RES]], metadata [[META167:![0-9]+]], metadata !DIExpression()), !dbg [[DBG174]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i64 [[RES]], metadata [[META167:![0-9]+]], metadata !DIExpression()), !dbg [[DBG174]]
 ; DEBUG-NEXT:    ret i64 [[RES]], !dbg [[DBG175:![0-9]+]]
 ;
   %tmp = alloca { <4 x i64>, <4 x i64> }
@@ -401,15 +401,15 @@ define <4 x i32> @test_subvec_store() {
 ;
 ; DEBUG-LABEL: @test_subvec_store(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META178:![0-9]+]], metadata !DIExpression()), !dbg [[DBG184:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META178:![0-9]+]], metadata !DIExpression()), !dbg [[DBG184:![0-9]+]]
 ; DEBUG-NEXT:    [[A_0_VECBLEND:%.*]] = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>, <4 x i32> undef, !dbg [[DBG185:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META179:![0-9]+]], metadata !DIExpression()), !dbg [[DBG186:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META179:![0-9]+]], metadata !DIExpression()), !dbg [[DBG186:![0-9]+]]
 ; DEBUG-NEXT:    [[A_4_VECBLEND:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>, <4 x i32> [[A_0_VECBLEND]], !dbg [[DBG187:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META180:![0-9]+]], metadata !DIExpression()), !dbg [[DBG188:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META180:![0-9]+]], metadata !DIExpression()), !dbg [[DBG188:![0-9]+]]
 ; DEBUG-NEXT:    [[A_8_VECBLEND:%.*]] = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> <i32 undef, i32 undef, i32 2, i32 2>, <4 x i32> [[A_4_VECBLEND]], !dbg [[DBG189:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META181:![0-9]+]], metadata !DIExpression()), !dbg [[DBG190:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META181:![0-9]+]], metadata !DIExpression()), !dbg [[DBG190:![0-9]+]]
 ; DEBUG-NEXT:    [[A_12_VEC_INSERT:%.*]] = insertelement <4 x i32> [[A_8_VECBLEND]], i32 3, i32 3, !dbg [[DBG191:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <4 x i32> [[A_12_VEC_INSERT]], metadata [[META182:![0-9]+]], metadata !DIExpression()), !dbg [[DBG192:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <4 x i32> [[A_12_VEC_INSERT]], metadata [[META182:![0-9]+]], metadata !DIExpression()), !dbg [[DBG192:![0-9]+]]
 ; DEBUG-NEXT:    ret <4 x i32> [[A_12_VEC_INSERT]], !dbg [[DBG193:![0-9]+]]
 ;
 entry:
@@ -443,19 +443,19 @@ define <4 x i32> @test_subvec_load() {
 ;
 ; DEBUG-LABEL: @test_subvec_load(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META196:![0-9]+]], metadata !DIExpression()), !dbg [[DBG204:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META196:![0-9]+]], metadata !DIExpression()), !dbg [[DBG204:![0-9]+]]
 ; DEBUG-NEXT:    [[A_0_VEC_EXTRACT:%.*]] = shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> poison, <2 x i32> <i32 0, i32 1>, !dbg [[DBG205:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <2 x i32> [[A_0_VEC_EXTRACT]], metadata [[META197:![0-9]+]], metadata !DIExpression()), !dbg [[DBG205]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META198:![0-9]+]], metadata !DIExpression()), !dbg [[DBG206:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <2 x i32> [[A_0_VEC_EXTRACT]], metadata [[META197:![0-9]+]], metadata !DIExpression()), !dbg [[DBG205]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META198:![0-9]+]], metadata !DIExpression()), !dbg [[DBG206:![0-9]+]]
 ; DEBUG-NEXT:    [[A_4_VEC_EXTRACT:%.*]] = shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> poison, <2 x i32> <i32 1, i32 2>, !dbg [[DBG207:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <2 x i32> [[A_4_VEC_EXTRACT]], metadata [[META199:![0-9]+]], metadata !DIExpression()), !dbg [[DBG207]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META200:![0-9]+]], metadata !DIExpression()), !dbg [[DBG208:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <2 x i32> [[A_4_VEC_EXTRACT]], metadata [[META199:![0-9]+]], metadata !DIExpression()), !dbg [[DBG207]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META200:![0-9]+]], metadata !DIExpression()), !dbg [[DBG208:![0-9]+]]
 ; DEBUG-NEXT:    [[A_8_VEC_EXTRACT:%.*]] = shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> poison, <2 x i32> <i32 2, i32 3>, !dbg [[DBG209:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <2 x i32> [[A_8_VEC_EXTRACT]], metadata [[META201:![0-9]+]], metadata !DIExpression()), !dbg [[DBG209]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <2 x i32> [[A_8_VEC_EXTRACT]], metadata [[META201:![0-9]+]], metadata !DIExpression()), !dbg [[DBG209]]
 ; DEBUG-NEXT:    [[TMP:%.*]] = shufflevector <2 x i32> [[A_0_VEC_EXTRACT]], <2 x i32> [[A_4_VEC_EXTRACT]], <2 x i32> <i32 0, i32 2>, !dbg [[DBG210:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <2 x i32> [[TMP]], metadata [[META202:![0-9]+]], metadata !DIExpression()), !dbg [[DBG210]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <2 x i32> [[TMP]], metadata [[META202:![0-9]+]], metadata !DIExpression()), !dbg [[DBG210]]
 ; DEBUG-NEXT:    [[RET:%.*]] = shufflevector <2 x i32> [[TMP]], <2 x i32> [[A_8_VEC_EXTRACT]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG211:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <4 x i32> [[RET]], metadata [[META203:![0-9]+]], metadata !DIExpression()), !dbg [[DBG211]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <4 x i32> [[RET]], metadata [[META203:![0-9]+]], metadata !DIExpression()), !dbg [[DBG211]]
 ; DEBUG-NEXT:    ret <4 x i32> [[RET]], !dbg [[DBG212:![0-9]+]]
 ;
 entry:
@@ -488,15 +488,15 @@ define <4 x float> @test_subvec_memset() {
 ;
 ; DEBUG-LABEL: @test_subvec_memset(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META215:![0-9]+]], metadata !DIExpression()), !dbg [[DBG220:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META215:![0-9]+]], metadata !DIExpression()), !dbg [[DBG220:![0-9]+]]
 ; DEBUG-NEXT:    [[A_0_VECBLEND:%.*]] = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> <float 0.000000e+00, float 0.000000e+00, float undef, float undef>, <4 x float> undef, !dbg [[DBG221:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META216:![0-9]+]], metadata !DIExpression()), !dbg [[DBG222:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META216:![0-9]+]], metadata !DIExpression()), !dbg [[DBG222:![0-9]+]]
 ; DEBUG-NEXT:    [[A_4_VECBLEND:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x float> <float undef, float 0x3820202020000000, float 0x3820202020000000, float undef>, <4 x float> [[A_0_VECBLEND]], !dbg [[DBG223:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META217:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META217:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224:![0-9]+]]
 ; DEBUG-NEXT:    [[A_8_VECBLEND:%.*]] = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> <float undef, float undef, float 0x3860606060000000, float 0x3860606060000000>, <4 x float> [[A_4_VECBLEND]], !dbg [[DBG225:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META218:![0-9]+]], metadata !DIExpression()), !dbg [[DBG226:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META218:![0-9]+]], metadata !DIExpression()), !dbg [[DBG226:![0-9]+]]
 ; DEBUG-NEXT:    [[A_12_VEC_INSERT:%.*]] = insertelement <4 x float> [[A_8_VECBLEND]], float 0x38E0E0E0E0000000, i32 3, !dbg [[DBG227:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <4 x float> [[A_12_VEC_INSERT]], metadata [[META219:![0-9]+]], metadata !DIExpression()), !dbg [[DBG228:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <4 x float> [[A_12_VEC_INSERT]], metadata [[META219:![0-9]+]], metadata !DIExpression()), !dbg [[DBG228:![0-9]+]]
 ; DEBUG-NEXT:    ret <4 x float> [[A_12_VEC_INSERT]], !dbg [[DBG229:![0-9]+]]
 ;
 entry:
@@ -538,24 +538,24 @@ define <4 x float> @test_subvec_memcpy(ptr %x, ptr %y, ptr %z, ptr %f, ptr %out)
 ;
 ; DEBUG-LABEL: @test_subvec_memcpy(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META232:![0-9]+]], metadata !DIExpression()), !dbg [[DBG237:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META232:![0-9]+]], metadata !DIExpression()), !dbg [[DBG237:![0-9]+]]
 ; DEBUG-NEXT:    [[A_0_COPYLOAD:%.*]] = load <2 x float>, ptr [[X:%.*]], align 1, !dbg [[DBG238:![0-9]+]]
 ; DEBUG-NEXT:    [[A_0_VEC_EXPAND:%.*]] = shufflevector <2 x float> [[A_0_COPYLOAD]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>, !dbg [[DBG238]]
 ; DEBUG-NEXT:    [[A_0_VECBLEND:%.*]] = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> [[A_0_VEC_EXPAND]], <4 x float> undef, !dbg [[DBG238]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META233:![0-9]+]], metadata !DIExpression()), !dbg [[DBG239:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META233:![0-9]+]], metadata !DIExpression()), !dbg [[DBG239:![0-9]+]]
 ; DEBUG-NEXT:    [[A_4_COPYLOAD:%.*]] = load <2 x float>, ptr [[Y:%.*]], align 1, !dbg [[DBG240:![0-9]+]]
 ; DEBUG-NEXT:    [[A_4_VEC_EXPAND:%.*]] = shufflevector <2 x float> [[A_4_COPYLOAD]], <2 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>, !dbg [[DBG240]]
 ; DEBUG-NEXT:    [[A_4_VECBLEND:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x float> [[A_4_VEC_EXPAND]], <4 x float> [[A_0_VECBLEND]], !dbg [[DBG240]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META234:![0-9]+]], metadata !DIExpression()), !dbg [[DBG241:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META234:![0-9]+]], metadata !DIExpression()), !dbg [[DBG241:![0-9]+]]
 ; DEBUG-NEXT:    [[A_8_COPYLOAD:%.*]] = load <2 x float>, ptr [[Z:%.*]], align 1, !dbg [[DBG242:![0-9]+]]
 ; DEBUG-NEXT:    [[A_8_VEC_EXPAND:%.*]] = shufflevector <2 x float> [[A_8_COPYLOAD]], <2 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>, !dbg [[DBG242]]
 ; DEBUG-NEXT:    [[A_8_VECBLEND:%.*]] = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> [[A_8_VEC_EXPAND]], <4 x float> [[A_4_VECBLEND]], !dbg [[DBG242]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META235:![0-9]+]], metadata !DIExpression()), !dbg [[DBG243:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META235:![0-9]+]], metadata !DIExpression()), !dbg [[DBG243:![0-9]+]]
 ; DEBUG-NEXT:    [[A_12_COPYLOAD:%.*]] = load float, ptr [[F:%.*]], align 1, !dbg [[DBG244:![0-9]+]]
 ; DEBUG-NEXT:    [[A_12_VEC_INSERT:%.*]] = insertelement <4 x float> [[A_8_VECBLEND]], float [[A_12_COPYLOAD]], i32 3, !dbg [[DBG244]]
 ; DEBUG-NEXT:    [[A_8_VEC_EXTRACT:%.*]] = shufflevector <4 x float> [[A_12_VEC_INSERT]], <4 x float> poison, <2 x i32> <i32 2, i32 3>, !dbg [[DBG245:![0-9]+]]
 ; DEBUG-NEXT:    store <2 x float> [[A_8_VEC_EXTRACT]], ptr [[OUT:%.*]], align 1, !dbg [[DBG245]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <4 x float> [[A_12_VEC_INSERT]], metadata [[META236:![0-9]+]], metadata !DIExpression()), !dbg [[DBG246:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <4 x float> [[A_12_VEC_INSERT]], metadata [[META236:![0-9]+]], metadata !DIExpression()), !dbg [[DBG246:![0-9]+]]
 ; DEBUG-NEXT:    ret <4 x float> [[A_12_VEC_INSERT]], !dbg [[DBG247:![0-9]+]]
 ;
 entry:
@@ -596,7 +596,7 @@ define i32 @PR14212(<3 x i8> %val) {
 ;
 ; DEBUG-LABEL: @PR14212(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META250:![0-9]+]], metadata !DIExpression()), !dbg [[DBG252:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META250:![0-9]+]], metadata !DIExpression()), !dbg [[DBG252:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP0:%.*]] = bitcast <3 x i8> [[VAL:%.*]] to i24, !dbg [[DBG253:![0-9]+]]
 ; DEBUG-NEXT:    [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i8 undef to i32, !dbg [[DBG254:![0-9]+]]
 ; DEBUG-NEXT:    [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl i32 [[RETVAL_SROA_2_0_INSERT_EXT]], 24, !dbg [[DBG254]]
@@ -605,7 +605,7 @@ define i32 @PR14212(<3 x i8> %val) {
 ; DEBUG-NEXT:    [[RETVAL_0_INSERT_EXT:%.*]] = zext i24 [[TMP0]] to i32, !dbg [[DBG254]]
 ; DEBUG-NEXT:    [[RETVAL_0_INSERT_MASK:%.*]] = and i32 [[RETVAL_SROA_2_0_INSERT_INSERT]], -16777216, !dbg [[DBG254]]
 ; DEBUG-NEXT:    [[RETVAL_0_INSERT_INSERT:%.*]] = or i32 [[RETVAL_0_INSERT_MASK]], [[RETVAL_0_INSERT_EXT]], !dbg [[DBG254]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[RETVAL_0_INSERT_INSERT]], metadata [[META251:![0-9]+]], metadata !DIExpression()), !dbg [[DBG253]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[RETVAL_0_INSERT_INSERT]], metadata [[META251:![0-9]+]], metadata !DIExpression()), !dbg [[DBG253]]
 ; DEBUG-NEXT:    ret i32 [[RETVAL_0_INSERT_INSERT]], !dbg [[DBG254]]
 ;
 entry:
@@ -630,12 +630,12 @@ define <2 x i8> @PR14349.1(i32 %x) {
 ;
 ; DEBUG-LABEL: @PR14349.1(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META257:![0-9]+]], metadata !DIExpression()), !dbg [[DBG260:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META257:![0-9]+]], metadata !DIExpression()), !dbg [[DBG260:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[X:%.*]] to i16, !dbg [[DBG261:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP0:%.*]] = bitcast i16 [[A_SROA_0_0_EXTRACT_TRUNC]] to <2 x i8>, !dbg [[DBG261]]
 ; DEBUG-NEXT:    [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[X]], 16, !dbg [[DBG261]]
 ; DEBUG-NEXT:    [[A_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[A_SROA_2_0_EXTRACT_SHIFT]] to i16, !dbg [[DBG261]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <2 x i8> [[TMP0]], metadata [[META258:![0-9]+]], metadata !DIExpression()), !dbg [[DBG262:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <2 x i8> [[TMP0]], metadata [[META258:![0-9]+]], metadata !DIExpression()), !dbg [[DBG262:![0-9]+]]
 ; DEBUG-NEXT:    ret <2 x i8> [[TMP0]], !dbg [[DBG263:![0-9]+]]
 ;
 entry:
@@ -666,7 +666,7 @@ define i32 @PR14349.2(<2 x i8> %x) {
 ;
 ; DEBUG-LABEL: @PR14349.2(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META266:![0-9]+]], metadata !DIExpression()), !dbg [[DBG268:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META266:![0-9]+]], metadata !DIExpression()), !dbg [[DBG268:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP0:%.*]] = bitcast <2 x i8> [[X:%.*]] to i16, !dbg [[DBG269:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_2_0_INSERT_EXT:%.*]] = zext i16 undef to i32, !dbg [[DBG270:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_2_0_INSERT_SHIFT:%.*]] = shl i32 [[A_SROA_2_0_INSERT_EXT]], 16, !dbg [[DBG270]]
@@ -675,7 +675,7 @@ define i32 @PR14349.2(<2 x i8> %x) {
 ; DEBUG-NEXT:    [[A_SROA_0_0_INSERT_EXT:%.*]] = zext i16 [[TMP0]] to i32, !dbg [[DBG270]]
 ; DEBUG-NEXT:    [[A_SROA_0_0_INSERT_MASK:%.*]] = and i32 [[A_SROA_2_0_INSERT_INSERT]], -65536, !dbg [[DBG270]]
 ; DEBUG-NEXT:    [[A_SROA_0_0_INSERT_INSERT:%.*]] = or i32 [[A_SROA_0_0_INSERT_MASK]], [[A_SROA_0_0_INSERT_EXT]], !dbg [[DBG270]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_0_0_INSERT_INSERT]], metadata [[META267:![0-9]+]], metadata !DIExpression()), !dbg [[DBG269]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_0_0_INSERT_INSERT]], metadata [[META267:![0-9]+]], metadata !DIExpression()), !dbg [[DBG269]]
 ; DEBUG-NEXT:    ret i32 [[A_SROA_0_0_INSERT_INSERT]], !dbg [[DBG270]]
 ;
 entry:
@@ -702,21 +702,21 @@ define i32 @test7(<2 x i32> %x, <2 x i32> %y) {
 ;
 ; DEBUG-LABEL: @test7(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META273:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META274:![0-9]+]], metadata !DIExpression()), !dbg [[DBG284:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META275:![0-9]+]], metadata !DIExpression()), !dbg [[DBG285:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META273:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META274:![0-9]+]], metadata !DIExpression()), !dbg [[DBG284:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META275:![0-9]+]], metadata !DIExpression()), !dbg [[DBG285:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x i32> [[X:%.*]], i32 1, !dbg [[DBG286:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_0_4_VEC_EXTRACT]], metadata [[META276:![0-9]+]], metadata !DIExpression()), !dbg [[DBG286]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META277:![0-9]+]], metadata !DIExpression()), !dbg [[DBG287:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_0_4_VEC_EXTRACT]], metadata [[META276:![0-9]+]], metadata !DIExpression()), !dbg [[DBG286]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META277:![0-9]+]], metadata !DIExpression()), !dbg [[DBG287:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_2_12_VEC_EXTRACT:%.*]] = extractelement <2 x i32> [[Y:%.*]], i32 1, !dbg [[DBG288:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_2_12_VEC_EXTRACT]], metadata [[META278:![0-9]+]], metadata !DIExpression()), !dbg [[DBG288]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META279:![0-9]+]], metadata !DIExpression()), !dbg [[DBG289:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_2_12_VEC_EXTRACT]], metadata [[META278:![0-9]+]], metadata !DIExpression()), !dbg [[DBG288]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META279:![0-9]+]], metadata !DIExpression()), !dbg [[DBG289:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_2_8_VEC_EXTRACT:%.*]] = extractelement <2 x i32> [[Y]], i32 0, !dbg [[DBG290:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_2_8_VEC_EXTRACT]], metadata [[META280:![0-9]+]], metadata !DIExpression()), !dbg [[DBG290]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_2_8_VEC_EXTRACT]], metadata [[META280:![0-9]+]], metadata !DIExpression()), !dbg [[DBG290]]
 ; DEBUG-NEXT:    [[TMP4:%.*]] = add i32 [[A_SROA_0_4_VEC_EXTRACT]], [[A_SROA_2_12_VEC_EXTRACT]], !dbg [[DBG291:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META281:![0-9]+]], metadata !DIExpression()), !dbg [[DBG291]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META281:![0-9]+]], metadata !DIExpression()), !dbg [[DBG291]]
 ; DEBUG-NEXT:    [[TMP5:%.*]] = add i32 [[A_SROA_2_8_VEC_EXTRACT]], [[TMP4]], !dbg [[DBG292:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP5]], metadata [[META282:![0-9]+]], metadata !DIExpression()), !dbg [[DBG292]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP5]], metadata [[META282:![0-9]+]], metadata !DIExpression()), !dbg [[DBG292]]
 ; DEBUG-NEXT:    ret i32 [[TMP5]], !dbg [[DBG293:![0-9]+]]
 ;
 entry:
@@ -751,14 +751,14 @@ define i32 @test8(<2 x i32> %x) {
 ;
 ; DEBUG-LABEL: @test8(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META296:![0-9]+]], metadata !DIExpression()), !dbg [[DBG301:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META296:![0-9]+]], metadata !DIExpression()), !dbg [[DBG301:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x i32> [[X:%.*]], i32 0, !dbg [[DBG302:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_0_0_VEC_EXTRACT]], metadata [[META297:![0-9]+]], metadata !DIExpression()), !dbg [[DBG302]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META298:![0-9]+]], metadata !DIExpression()), !dbg [[DBG303:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_0_0_VEC_EXTRACT]], metadata [[META297:![0-9]+]], metadata !DIExpression()), !dbg [[DBG302]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META298:![0-9]+]], metadata !DIExpression()), !dbg [[DBG303:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x i32> [[X]], i32 1, !dbg [[DBG304:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[A_SROA_0_4_VEC_EXTRACT]], metadata [[META299:![0-9]+]], metadata !DIExpression()), !dbg [[DBG304]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[A_SROA_0_4_VEC_EXTRACT]], metadata [[META299:![0-9]+]], metadata !DIExpression()), !dbg [[DBG304]]
 ; DEBUG-NEXT:    [[TMP4:%.*]] = add i32 [[A_SROA_0_0_VEC_EXTRACT]], [[A_SROA_0_4_VEC_EXTRACT]], !dbg [[DBG305:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META300:![0-9]+]], metadata !DIExpression()), !dbg [[DBG305]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[TMP4]], metadata [[META300:![0-9]+]], metadata !DIExpression()), !dbg [[DBG305]]
 ; DEBUG-NEXT:    ret i32 [[TMP4]], !dbg [[DBG306:![0-9]+]]
 ;
 entry:
@@ -786,11 +786,11 @@ define <2 x i32> @test9(i32 %x, i32 %y) {
 ;
 ; DEBUG-LABEL: @test9(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META309:![0-9]+]], metadata !DIExpression()), !dbg [[DBG312:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META309:![0-9]+]], metadata !DIExpression()), !dbg [[DBG312:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i32> undef, i32 [[X:%.*]], i32 0, !dbg [[DBG313:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META310:![0-9]+]], metadata !DIExpression()), !dbg [[DBG314:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META310:![0-9]+]], metadata !DIExpression()), !dbg [[DBG314:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[A_SROA_0_0_VEC_INSERT]], i32 [[Y:%.*]], i32 1, !dbg [[DBG315:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <2 x i32> [[A_SROA_0_4_VEC_INSERT]], metadata [[META311:![0-9]+]], metadata !DIExpression()), !dbg [[DBG316:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <2 x i32> [[A_SROA_0_4_VEC_INSERT]], metadata [[META311:![0-9]+]], metadata !DIExpression()), !dbg [[DBG316:![0-9]+]]
 ; DEBUG-NEXT:    ret <2 x i32> [[A_SROA_0_4_VEC_INSERT]], !dbg [[DBG317:![0-9]+]]
 ;
 entry:
@@ -817,11 +817,11 @@ define <2 x i32> @test10(<4 x i16> %x, i32 %y) {
 ;
 ; DEBUG-LABEL: @test10(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META320:![0-9]+]], metadata !DIExpression()), !dbg [[DBG323:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META320:![0-9]+]], metadata !DIExpression()), !dbg [[DBG323:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[X:%.*]] to <2 x i32>, !dbg [[DBG324:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META321:![0-9]+]], metadata !DIExpression()), !dbg [[DBG325:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META321:![0-9]+]], metadata !DIExpression()), !dbg [[DBG325:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Y:%.*]], i32 1, !dbg [[DBG326:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <2 x i32> [[A_SROA_0_4_VEC_INSERT]], metadata [[META322:![0-9]+]], metadata !DIExpression()), !dbg [[DBG327:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <2 x i32> [[A_SROA_0_4_VEC_INSERT]], metadata [[META322:![0-9]+]], metadata !DIExpression()), !dbg [[DBG327:![0-9]+]]
 ; DEBUG-NEXT:    ret <2 x i32> [[A_SROA_0_4_VEC_INSERT]], !dbg [[DBG328:![0-9]+]]
 ;
 entry:
@@ -850,12 +850,12 @@ define <2 x float> @test11(<4 x i16> %x, i32 %y) {
 ;
 ; DEBUG-LABEL: @test11(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META331:![0-9]+]], metadata !DIExpression()), !dbg [[DBG334:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META331:![0-9]+]], metadata !DIExpression()), !dbg [[DBG334:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[X:%.*]] to <2 x i32>, !dbg [[DBG335:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META332:![0-9]+]], metadata !DIExpression()), !dbg [[DBG336:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META332:![0-9]+]], metadata !DIExpression()), !dbg [[DBG336:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Y:%.*]], i32 1, !dbg [[DBG337:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[A_SROA_0_4_VEC_INSERT]] to <2 x float>, !dbg [[DBG338:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <2 x float> [[TMP1]], metadata [[META333:![0-9]+]], metadata !DIExpression()), !dbg [[DBG338]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <2 x float> [[TMP1]], metadata [[META333:![0-9]+]], metadata !DIExpression()), !dbg [[DBG338]]
 ; DEBUG-NEXT:    ret <2 x float> [[TMP1]], !dbg [[DBG339:![0-9]+]]
 ;
 entry:
@@ -876,9 +876,9 @@ define <4 x float> @test12(<4 x i32> %val) {
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; DEBUG-LABEL: @test12(
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META342:![0-9]+]], metadata !DIExpression()), !dbg [[DBG344:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META342:![0-9]+]], metadata !DIExpression()), !dbg [[DBG344:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VAL:%.*]] to <4 x float>, !dbg [[DBG345:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <4 x float> [[TMP1]], metadata [[META343:![0-9]+]], metadata !DIExpression()), !dbg [[DBG345]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <4 x float> [[TMP1]], metadata [[META343:![0-9]+]], metadata !DIExpression()), !dbg [[DBG345]]
 ; DEBUG-NEXT:    ret <4 x float> [[TMP1]], !dbg [[DBG346:![0-9]+]]
 ;
   %a = alloca <3 x i32>, align 16
@@ -904,16 +904,16 @@ define <2 x i64> @test13(i32 %a, i32 %b, i32 %c, i32 %d) {
 ;
 ; DEBUG-LABEL: @test13(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META349:![0-9]+]], metadata !DIExpression()), !dbg [[DBG354:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META349:![0-9]+]], metadata !DIExpression()), !dbg [[DBG354:![0-9]+]]
 ; DEBUG-NEXT:    [[X_SROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0, !dbg [[DBG355:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META350:![0-9]+]], metadata !DIExpression()), !dbg [[DBG356:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META350:![0-9]+]], metadata !DIExpression()), !dbg [[DBG356:![0-9]+]]
 ; DEBUG-NEXT:    [[X_SROA_0_4_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_0_VEC_INSERT]], i32 [[B:%.*]], i32 1, !dbg [[DBG357:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META351:![0-9]+]], metadata !DIExpression()), !dbg [[DBG358:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META351:![0-9]+]], metadata !DIExpression()), !dbg [[DBG358:![0-9]+]]
 ; DEBUG-NEXT:    [[X_SROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_4_VEC_INSERT]], i32 [[C:%.*]], i32 2, !dbg [[DBG359:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META352:![0-9]+]], metadata !DIExpression()), !dbg [[DBG360:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META352:![0-9]+]], metadata !DIExpression()), !dbg [[DBG360:![0-9]+]]
 ; DEBUG-NEXT:    [[X_SROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_8_VEC_INSERT]], i32 [[D:%.*]], i32 3, !dbg [[DBG361:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[X_SROA_0_12_VEC_INSERT]] to <2 x i64>, !dbg [[DBG362:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <2 x i64> [[TMP0]], metadata [[META353:![0-9]+]], metadata !DIExpression()), !dbg [[DBG362]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <2 x i64> [[TMP0]], metadata [[META353:![0-9]+]], metadata !DIExpression()), !dbg [[DBG362]]
 ; DEBUG-NEXT:    ret <2 x i64> [[TMP0]], !dbg [[DBG363:![0-9]+]]
 ;
 entry:
@@ -946,26 +946,26 @@ define i32 @test14(<2 x i64> %x) {
 ;
 ; DEBUG-LABEL: @test14(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META366:![0-9]+]], metadata !DIExpression()), !dbg [[DBG378:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META366:![0-9]+]], metadata !DIExpression()), !dbg [[DBG378:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[X:%.*]] to <4 x i32>, !dbg [[DBG379:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META367:![0-9]+]], metadata !DIExpression()), !dbg [[DBG380:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META367:![0-9]+]], metadata !DIExpression()), !dbg [[DBG380:![0-9]+]]
 ; DEBUG-NEXT:    [[X_ADDR_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0, !dbg [[DBG381:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[X_ADDR_SROA_0_0_VEC_EXTRACT]], metadata [[META368:![0-9]+]], metadata !DIExpression()), !dbg [[DBG381]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META369:![0-9]+]], metadata !DIExpression()), !dbg [[DBG382:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[X_ADDR_SROA_0_0_VEC_EXTRACT]], metadata [[META368:![0-9]+]], metadata !DIExpression()), !dbg [[DBG381]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META369:![0-9]+]], metadata !DIExpression()), !dbg [[DBG382:![0-9]+]]
 ; DEBUG-NEXT:    [[X_ADDR_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1, !dbg [[DBG383:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[X_ADDR_SROA_0_4_VEC_EXTRACT]], metadata [[META370:![0-9]+]], metadata !DIExpression()), !dbg [[DBG383]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META371:![0-9]+]], metadata !DIExpression()), !dbg [[DBG384:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[X_ADDR_SROA_0_4_VEC_EXTRACT]], metadata [[META370:![0-9]+]], metadata !DIExpression()), !dbg [[DBG383]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META371:![0-9]+]], metadata !DIExpression()), !dbg [[DBG384:![0-9]+]]
 ; DEBUG-NEXT:    [[X_ADDR_SROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2, !dbg [[DBG385:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[X_ADDR_SROA_0_8_VEC_EXTRACT]], metadata [[META372:![0-9]+]], metadata !DIExpression()), !dbg [[DBG385]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META373:![0-9]+]], metadata !DIExpression()), !dbg [[DBG386:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[X_ADDR_SROA_0_8_VEC_EXTRACT]], metadata [[META372:![0-9]+]], metadata !DIExpression()), !dbg [[DBG385]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META373:![0-9]+]], metadata !DIExpression()), !dbg [[DBG386:![0-9]+]]
 ; DEBUG-NEXT:    [[X_ADDR_SROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3, !dbg [[DBG387:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[X_ADDR_SROA_0_12_VEC_EXTRACT]], metadata [[META374:![0-9]+]], metadata !DIExpression()), !dbg [[DBG387]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[X_ADDR_SROA_0_12_VEC_EXTRACT]], metadata [[META374:![0-9]+]], metadata !DIExpression()), !dbg [[DBG387]]
 ; DEBUG-NEXT:    [[ADD:%.*]] = add i32 [[X_ADDR_SROA_0_0_VEC_EXTRACT]], [[X_ADDR_SROA_0_4_VEC_EXTRACT]], !dbg [[DBG388:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[ADD]], metadata [[META375:![0-9]+]], metadata !DIExpression()), !dbg [[DBG388]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[ADD]], metadata [[META375:![0-9]+]], metadata !DIExpression()), !dbg [[DBG388]]
 ; DEBUG-NEXT:    [[ADD1:%.*]] = add i32 [[X_ADDR_SROA_0_8_VEC_EXTRACT]], [[X_ADDR_SROA_0_12_VEC_EXTRACT]], !dbg [[DBG389:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[ADD1]], metadata [[META376:![0-9]+]], metadata !DIExpression()), !dbg [[DBG389]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[ADD1]], metadata [[META376:![0-9]+]], metadata !DIExpression()), !dbg [[DBG389]]
 ; DEBUG-NEXT:    [[ADD2:%.*]] = add i32 [[ADD]], [[ADD1]], !dbg [[DBG390:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i32 [[ADD2]], metadata [[META377:![0-9]+]], metadata !DIExpression()), !dbg [[DBG390]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[ADD2]], metadata [[META377:![0-9]+]], metadata !DIExpression()), !dbg [[DBG390]]
 ; DEBUG-NEXT:    ret i32 [[ADD2]], !dbg [[DBG391:![0-9]+]]
 ;
 entry:
@@ -1002,19 +1002,19 @@ define <4 x ptr> @test15(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; DEBUG-LABEL: @test15(
 ; DEBUG-NEXT:  entry:
 ; DEBUG-NEXT:    [[X_SROA_0:%.*]] = alloca <4 x ptr>, align 32, !dbg [[DBG400:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META394:![0-9]+]], metadata !DIExpression()), !dbg [[DBG400]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META394:![0-9]+]], metadata !DIExpression()), !dbg [[DBG400]]
 ; DEBUG-NEXT:    store i32 [[A:%.*]], ptr [[X_SROA_0]], align 32, !dbg [[DBG401:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META395:![0-9]+]], metadata !DIExpression()), !dbg [[DBG402:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META395:![0-9]+]], metadata !DIExpression()), !dbg [[DBG402:![0-9]+]]
 ; DEBUG-NEXT:    [[X_SROA_0_4_X_TMP2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 4, !dbg [[DBG403:![0-9]+]]
 ; DEBUG-NEXT:    store i32 [[B:%.*]], ptr [[X_SROA_0_4_X_TMP2_SROA_IDX1]], align 4, !dbg [[DBG403]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META396:![0-9]+]], metadata !DIExpression()), !dbg [[DBG404:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META396:![0-9]+]], metadata !DIExpression()), !dbg [[DBG404:![0-9]+]]
 ; DEBUG-NEXT:    [[X_SROA_0_8_X_TMP3_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 8, !dbg [[DBG405:![0-9]+]]
 ; DEBUG-NEXT:    store i32 [[C:%.*]], ptr [[X_SROA_0_8_X_TMP3_SROA_IDX2]], align 8, !dbg [[DBG405]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META397:![0-9]+]], metadata !DIExpression()), !dbg [[DBG406:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META397:![0-9]+]], metadata !DIExpression()), !dbg [[DBG406:![0-9]+]]
 ; DEBUG-NEXT:    [[X_SROA_0_12_X_TMP4_SROA_IDX3:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 12, !dbg [[DBG407:![0-9]+]]
 ; DEBUG-NEXT:    store i32 [[D:%.*]], ptr [[X_SROA_0_12_X_TMP4_SROA_IDX3]], align 4, !dbg [[DBG407]]
 ; DEBUG-NEXT:    [[X_SROA_0_0_X_SROA_0_0_RESULT:%.*]] = load <4 x ptr>, ptr [[X_SROA_0]], align 32, !dbg [[DBG408:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <4 x ptr> [[X_SROA_0_0_X_SROA_0_0_RESULT]], metadata [[META398:![0-9]+]], metadata !DIExpression()), !dbg [[DBG408]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <4 x ptr> [[X_SROA_0_0_X_SROA_0_0_RESULT]], metadata [[META398:![0-9]+]], metadata !DIExpression()), !dbg [[DBG408]]
 ; DEBUG-NEXT:    ret <4 x ptr> [[X_SROA_0_0_X_SROA_0_0_RESULT]], !dbg [[DBG409:![0-9]+]]
 ;
 entry:
@@ -1045,19 +1045,19 @@ define <4 x ptr> @test16(i64 %a, i64 %b, i64 %c, i64 %d) {
 ;
 ; DEBUG-LABEL: @test16(
 ; DEBUG-NEXT:  entry:
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META412:![0-9]+]], metadata !DIExpression()), !dbg [[DBG417:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META412:![0-9]+]], metadata !DIExpression()), !dbg [[DBG417:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP0:%.*]] = inttoptr i64 [[A:%.*]] to ptr, !dbg [[DBG418:![0-9]+]]
 ; DEBUG-NEXT:    [[X_SROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x ptr> undef, ptr [[TMP0]], i32 0, !dbg [[DBG418]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META413:![0-9]+]], metadata !DIExpression()), !dbg [[DBG419:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META413:![0-9]+]], metadata !DIExpression()), !dbg [[DBG419:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[B:%.*]] to ptr, !dbg [[DBG420:![0-9]+]]
 ; DEBUG-NEXT:    [[X_SROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x ptr> [[X_SROA_0_0_VEC_INSERT]], ptr [[TMP1]], i32 1, !dbg [[DBG420]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META414:![0-9]+]], metadata !DIExpression()), !dbg [[DBG421:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META414:![0-9]+]], metadata !DIExpression()), !dbg [[DBG421:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[C:%.*]] to ptr, !dbg [[DBG422:![0-9]+]]
 ; DEBUG-NEXT:    [[X_SROA_0_16_VEC_INSERT:%.*]] = insertelement <4 x ptr> [[X_SROA_0_8_VEC_INSERT]], ptr [[TMP2]], i32 2, !dbg [[DBG422]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META415:![0-9]+]], metadata !DIExpression()), !dbg [[DBG423:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META415:![0-9]+]], metadata !DIExpression()), !dbg [[DBG423:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[D:%.*]] to ptr, !dbg [[DBG424:![0-9]+]]
 ; DEBUG-NEXT:    [[X_SROA_0_24_VEC_INSERT:%.*]] = insertelement <4 x ptr> [[X_SROA_0_16_VEC_INSERT]], ptr [[TMP3]], i32 3, !dbg [[DBG424]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <4 x ptr> [[X_SROA_0_24_VEC_INSERT]], metadata [[META416:![0-9]+]], metadata !DIExpression()), !dbg [[DBG425:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <4 x ptr> [[X_SROA_0_24_VEC_INSERT]], metadata [[META416:![0-9]+]], metadata !DIExpression()), !dbg [[DBG425:![0-9]+]]
 ; DEBUG-NEXT:    ret <4 x ptr> [[X_SROA_0_24_VEC_INSERT]], !dbg [[DBG426:![0-9]+]]
 ;
 entry:
@@ -1090,19 +1090,19 @@ define <4 x ptr> @test17(i32 %a, i32 %b, i64 %c, i64 %d) {
 ; DEBUG-LABEL: @test17(
 ; DEBUG-NEXT:  entry:
 ; DEBUG-NEXT:    [[X_SROA_0:%.*]] = alloca <4 x ptr>, align 32, !dbg [[DBG434:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META429:![0-9]+]], metadata !DIExpression()), !dbg [[DBG434]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META429:![0-9]+]], metadata !DIExpression()), !dbg [[DBG434]]
 ; DEBUG-NEXT:    store i32 [[A:%.*]], ptr [[X_SROA_0]], align 32, !dbg [[DBG435:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META430:![0-9]+]], metadata !DIExpression()), !dbg [[DBG436:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META430:![0-9]+]], metadata !DIExpression()), !dbg [[DBG436:![0-9]+]]
 ; DEBUG-NEXT:    [[X_SROA_0_4_X_TMP2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 4, !dbg [[DBG437:![0-9]+]]
 ; DEBUG-NEXT:    store i32 [[B:%.*]], ptr [[X_SROA_0_4_X_TMP2_SROA_IDX1]], align 4, !dbg [[DBG437]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META431:![0-9]+]], metadata !DIExpression()), !dbg [[DBG438:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META431:![0-9]+]], metadata !DIExpression()), !dbg [[DBG438:![0-9]+]]
 ; DEBUG-NEXT:    [[X_SROA_0_16_X_TMP3_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 16, !dbg [[DBG439:![0-9]+]]
 ; DEBUG-NEXT:    store i64 [[C:%.*]], ptr [[X_SROA_0_16_X_TMP3_SROA_IDX2]], align 16, !dbg [[DBG439]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META432:![0-9]+]], metadata !DIExpression()), !dbg [[DBG440:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META432:![0-9]+]], metadata !DIExpression()), !dbg [[DBG440:![0-9]+]]
 ; DEBUG-NEXT:    [[X_SROA_0_24_X_TMP4_SROA_IDX3:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 24, !dbg [[DBG441:![0-9]+]]
 ; DEBUG-NEXT:    store i64 [[D:%.*]], ptr [[X_SROA_0_24_X_TMP4_SROA_IDX3]], align 8, !dbg [[DBG441]]
 ; DEBUG-NEXT:    [[X_SROA_0_0_X_SROA_0_0_RESULT:%.*]] = load <4 x ptr>, ptr [[X_SROA_0]], align 32, !dbg [[DBG442:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <4 x ptr> [[X_SROA_0_0_X_SROA_0_0_RESULT]], metadata [[META433:![0-9]+]], metadata !DIExpression()), !dbg [[DBG442]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <4 x ptr> [[X_SROA_0_0_X_SROA_0_0_RESULT]], metadata [[META433:![0-9]+]], metadata !DIExpression()), !dbg [[DBG442]]
 ; DEBUG-NEXT:    ret <4 x ptr> [[X_SROA_0_0_X_SROA_0_0_RESULT]], !dbg [[DBG443:![0-9]+]]
 ;
 entry:
@@ -1129,10 +1129,10 @@ define i1 @test18() {
 ;
 ; DEBUG-LABEL: @test18(
 ; DEBUG-NEXT:    [[A_SROA_0:%.*]] = alloca <2 x i64>, align 32, !dbg [[DBG449:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META446:![0-9]+]], metadata !DIExpression()), !dbg [[DBG449]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META446:![0-9]+]], metadata !DIExpression()), !dbg [[DBG449]]
 ; DEBUG-NEXT:    store <2 x i64> <i64 0, i64 -1>, ptr [[A_SROA_0]], align 32, !dbg [[DBG450:![0-9]+]]
 ; DEBUG-NEXT:    [[A_SROA_0_0_A_SROA_0_0_L:%.*]] = load i1, ptr [[A_SROA_0]], align 32, !dbg [[DBG451:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata i1 [[A_SROA_0_0_A_SROA_0_0_L]], metadata [[META447:![0-9]+]], metadata !DIExpression()), !dbg [[DBG451]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata i1 [[A_SROA_0_0_A_SROA_0_0_L]], metadata [[META447:![0-9]+]], metadata !DIExpression()), !dbg [[DBG451]]
 ; DEBUG-NEXT:    ret i1 [[A_SROA_0_0_A_SROA_0_0_L]], !dbg [[DBG452:![0-9]+]]
 ;
   %a = alloca <8 x i32>
@@ -1149,7 +1149,7 @@ define void @swap-8bytes(ptr %x, ptr %y) {
 ; CHECK-NEXT:    ret void
 ;
 ; DEBUG-LABEL: @swap-8bytes(
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META455:![0-9]+]], metadata !DIExpression()), !dbg [[DBG456:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META455:![0-9]+]], metadata !DIExpression()), !dbg [[DBG456:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[X:%.*]], align 1, !dbg [[DBG457:![0-9]+]]
 ; DEBUG-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[X]], ptr [[Y:%.*]], i64 8, i1 false), !dbg [[DBG458:![0-9]+]]
 ; DEBUG-NEXT:    store i64 [[TMP_SROA_0_0_COPYLOAD]], ptr [[Y]], align 1, !dbg [[DBG459:![0-9]+]]
@@ -1172,7 +1172,7 @@ define void @swap-7bytes(ptr %x, ptr %y) {
 ;
 ; DEBUG-LABEL: @swap-7bytes(
 ; DEBUG-NEXT:    [[TMP:%.*]] = alloca [7 x i8], align 1, !dbg [[DBG464:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr [[TMP]], metadata [[META463:![0-9]+]], metadata !DIExpression()), !dbg [[DBG464]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr [[TMP]], metadata [[META463:![0-9]+]], metadata !DIExpression()), !dbg [[DBG464]]
 ; DEBUG-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[TMP]], ptr [[X:%.*]], i64 7, i1 false), !dbg [[DBG465:![0-9]+]]
 ; DEBUG-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[X]], ptr [[Y:%.*]], i64 7, i1 false), !dbg [[DBG466:![0-9]+]]
 ; DEBUG-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[Y]], ptr [[TMP]], i64 7, i1 false), !dbg [[DBG467:![0-9]+]]
@@ -1195,7 +1195,7 @@ define void @swap-16bytes(ptr %x, ptr %y) {
 ;
 ; DEBUG-LABEL: @swap-16bytes(
 ; DEBUG-NEXT:    [[TMP:%.*]] = alloca [2 x i64], align 8, !dbg [[DBG472:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr [[TMP]], metadata [[META471:![0-9]+]], metadata !DIExpression()), !dbg [[DBG472]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr [[TMP]], metadata [[META471:![0-9]+]], metadata !DIExpression()), !dbg [[DBG472]]
 ; DEBUG-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[TMP]], ptr [[X:%.*]], i64 16, i1 false), !dbg [[DBG473:![0-9]+]]
 ; DEBUG-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[X]], ptr [[Y:%.*]], i64 16, i1 false), !dbg [[DBG474:![0-9]+]]
 ; DEBUG-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[Y]], ptr [[TMP]], i64 16, i1 false), !dbg [[DBG475:![0-9]+]]
@@ -1218,7 +1218,7 @@ define void @swap-15bytes(ptr %x, ptr %y) {
 ;
 ; DEBUG-LABEL: @swap-15bytes(
 ; DEBUG-NEXT:    [[TMP:%.*]] = alloca [15 x i8], align 1, !dbg [[DBG480:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr [[TMP]], metadata [[META479:![0-9]+]], metadata !DIExpression()), !dbg [[DBG480]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr [[TMP]], metadata [[META479:![0-9]+]], metadata !DIExpression()), !dbg [[DBG480]]
 ; DEBUG-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[TMP]], ptr [[X:%.*]], i64 15, i1 false), !dbg [[DBG481:![0-9]+]]
 ; DEBUG-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[X]], ptr [[Y:%.*]], i64 15, i1 false), !dbg [[DBG482:![0-9]+]]
 ; DEBUG-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[Y]], ptr [[TMP]], i64 15, i1 false), !dbg [[DBG483:![0-9]+]]
@@ -1245,17 +1245,17 @@ define <4 x i32> @ptrLoadStoreTys(ptr %init, i32 %val2) {
 ;
 ; DEBUG-LABEL: @ptrLoadStoreTys(
 ; DEBUG-NEXT:    [[VAL0:%.*]] = load ptr, ptr [[INIT:%.*]], align 8, !dbg [[DBG492:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr [[VAL0]], metadata [[META487:![0-9]+]], metadata !DIExpression()), !dbg [[DBG492]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META488:![0-9]+]], metadata !DIExpression()), !dbg [[DBG493:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr [[VAL0]], metadata [[META487:![0-9]+]], metadata !DIExpression()), !dbg [[DBG492]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META488:![0-9]+]], metadata !DIExpression()), !dbg [[DBG493:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[VAL0]] to i64, !dbg [[DBG494:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32>, !dbg [[DBG494]]
 ; DEBUG-NEXT:    [[OBJ_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>, !dbg [[DBG494]]
 ; DEBUG-NEXT:    [[OBJ_0_VECBLEND:%.*]] = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> [[OBJ_0_VEC_EXPAND]], <4 x i32> zeroinitializer, !dbg [[DBG494]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META489:![0-9]+]], metadata !DIExpression()), !dbg [[DBG495:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META489:![0-9]+]], metadata !DIExpression()), !dbg [[DBG495:![0-9]+]]
 ; DEBUG-NEXT:    [[OBJ_8_VEC_INSERT:%.*]] = insertelement <4 x i32> [[OBJ_0_VECBLEND]], i32 [[VAL2:%.*]], i32 2, !dbg [[DBG496:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META490:![0-9]+]], metadata !DIExpression()), !dbg [[DBG497:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META490:![0-9]+]], metadata !DIExpression()), !dbg [[DBG497:![0-9]+]]
 ; DEBUG-NEXT:    [[OBJ_12_VEC_INSERT:%.*]] = insertelement <4 x i32> [[OBJ_8_VEC_INSERT]], i32 131072, i32 3, !dbg [[DBG498:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <4 x i32> [[OBJ_12_VEC_INSERT]], metadata [[META491:![0-9]+]], metadata !DIExpression()), !dbg [[DBG499:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <4 x i32> [[OBJ_12_VEC_INSERT]], metadata [[META491:![0-9]+]], metadata !DIExpression()), !dbg [[DBG499:![0-9]+]]
 ; DEBUG-NEXT:    ret <4 x i32> [[OBJ_12_VEC_INSERT]], !dbg [[DBG500:![0-9]+]]
 ;
   %val0 = load ptr, ptr %init, align 8
@@ -1285,19 +1285,19 @@ define <4 x float> @ptrLoadStoreTysFloat(ptr %init, float %val2) {
 ;
 ; DEBUG-LABEL: @ptrLoadStoreTysFloat(
 ; DEBUG-NEXT:    [[VAL0:%.*]] = load ptr, ptr [[INIT:%.*]], align 8, !dbg [[DBG508:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr [[VAL0]], metadata [[META503:![0-9]+]], metadata !DIExpression()), !dbg [[DBG508]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr [[VAL0]], metadata [[META503:![0-9]+]], metadata !DIExpression()), !dbg [[DBG508]]
 ; DEBUG-NEXT:    [[OBJ:%.*]] = alloca <4 x float>, align 16, !dbg [[DBG509:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr [[OBJ]], metadata [[META504:![0-9]+]], metadata !DIExpression()), !dbg [[DBG509]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr [[OBJ]], metadata [[META504:![0-9]+]], metadata !DIExpression()), !dbg [[DBG509]]
 ; DEBUG-NEXT:    store <4 x float> zeroinitializer, ptr [[OBJ]], align 16, !dbg [[DBG510:![0-9]+]]
 ; DEBUG-NEXT:    store ptr [[VAL0]], ptr [[OBJ]], align 16, !dbg [[DBG511:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META505:![0-9]+]], metadata !DIExpression()), !dbg [[DBG512:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META505:![0-9]+]], metadata !DIExpression()), !dbg [[DBG512:![0-9]+]]
 ; DEBUG-NEXT:    [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG513:![0-9]+]]
 ; DEBUG-NEXT:    store float [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8, !dbg [[DBG513]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META506:![0-9]+]], metadata !DIExpression()), !dbg [[DBG514:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META506:![0-9]+]], metadata !DIExpression()), !dbg [[DBG514:![0-9]+]]
 ; DEBUG-NEXT:    [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12, !dbg [[DBG515:![0-9]+]]
 ; DEBUG-NEXT:    store float 1.310720e+05, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4, !dbg [[DBG515]]
 ; DEBUG-NEXT:    [[OBJ_0_SROAVAL:%.*]] = load <4 x float>, ptr [[OBJ]], align 16, !dbg [[DBG516:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <4 x float> [[OBJ_0_SROAVAL]], metadata [[META507:![0-9]+]], metadata !DIExpression()), !dbg [[DBG516]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <4 x float> [[OBJ_0_SROAVAL]], metadata [[META507:![0-9]+]], metadata !DIExpression()), !dbg [[DBG516]]
 ; DEBUG-NEXT:    ret <4 x float> [[OBJ_0_SROAVAL]], !dbg [[DBG517:![0-9]+]]
 ;
   %val0 = load ptr, ptr %init, align 8
@@ -1325,17 +1325,17 @@ define <4 x i32> @ptrLoadStoreTysAS3(ptr %init, i32 %val2) {
 ;
 ; DEBUG-LABEL: @ptrLoadStoreTysAS3(
 ; DEBUG-NEXT:    [[VAL0:%.*]] = load ptr addrspace(3), ptr [[INIT:%.*]], align 8, !dbg [[DBG525:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr addrspace(3) [[VAL0]], metadata [[META520:![0-9]+]], metadata !DIExpression()), !dbg [[DBG525]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META521:![0-9]+]], metadata !DIExpression()), !dbg [[DBG526:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr addrspace(3) [[VAL0]], metadata [[META520:![0-9]+]], metadata !DIExpression()), !dbg [[DBG525]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META521:![0-9]+]], metadata !DIExpression()), !dbg [[DBG526:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[VAL0]] to i64, !dbg [[DBG527:![0-9]+]]
 ; DEBUG-NEXT:    [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32>, !dbg [[DBG527]]
 ; DEBUG-NEXT:    [[OBJ_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>, !dbg [[DBG527]]
 ; DEBUG-NEXT:    [[OBJ_0_VECBLEND:%.*]] = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> [[OBJ_0_VEC_EXPAND]], <4 x i32> zeroinitializer, !dbg [[DBG527]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META522:![0-9]+]], metadata !DIExpression()), !dbg [[DBG528:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META522:![0-9]+]], metadata !DIExpression()), !dbg [[DBG528:![0-9]+]]
 ; DEBUG-NEXT:    [[OBJ_8_VEC_INSERT:%.*]] = insertelement <4 x i32> [[OBJ_0_VECBLEND]], i32 [[VAL2:%.*]], i32 2, !dbg [[DBG529:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META523:![0-9]+]], metadata !DIExpression()), !dbg [[DBG530:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META523:![0-9]+]], metadata !DIExpression()), !dbg [[DBG530:![0-9]+]]
 ; DEBUG-NEXT:    [[OBJ_12_VEC_INSERT:%.*]] = insertelement <4 x i32> [[OBJ_8_VEC_INSERT]], i32 131072, i32 3, !dbg [[DBG531:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <4 x i32> [[OBJ_12_VEC_INSERT]], metadata [[META524:![0-9]+]], metadata !DIExpression()), !dbg [[DBG532:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <4 x i32> [[OBJ_12_VEC_INSERT]], metadata [[META524:![0-9]+]], metadata !DIExpression()), !dbg [[DBG532:![0-9]+]]
 ; DEBUG-NEXT:    ret <4 x i32> [[OBJ_12_VEC_INSERT]], !dbg [[DBG533:![0-9]+]]
 ;
   %val0 = load ptr addrspace(3), ptr %init, align 8
@@ -1365,19 +1365,19 @@ define <4 x ptr> @ptrLoadStoreTysPtr(ptr %init, i64 %val2) {
 ;
 ; DEBUG-LABEL: @ptrLoadStoreTysPtr(
 ; DEBUG-NEXT:    [[VAL0:%.*]] = load ptr, ptr [[INIT:%.*]], align 8, !dbg [[DBG541:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr [[VAL0]], metadata [[META536:![0-9]+]], metadata !DIExpression()), !dbg [[DBG541]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr [[VAL0]], metadata [[META536:![0-9]+]], metadata !DIExpression()), !dbg [[DBG541]]
 ; DEBUG-NEXT:    [[OBJ:%.*]] = alloca <4 x ptr>, align 16, !dbg [[DBG542:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr [[OBJ]], metadata [[META537:![0-9]+]], metadata !DIExpression()), !dbg [[DBG542]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr [[OBJ]], metadata [[META537:![0-9]+]], metadata !DIExpression()), !dbg [[DBG542]]
 ; DEBUG-NEXT:    store <4 x ptr> zeroinitializer, ptr [[OBJ]], align 16, !dbg [[DBG543:![0-9]+]]
 ; DEBUG-NEXT:    store ptr [[VAL0]], ptr [[OBJ]], align 16, !dbg [[DBG544:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META538:![0-9]+]], metadata !DIExpression()), !dbg [[DBG545:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META538:![0-9]+]], metadata !DIExpression()), !dbg [[DBG545:![0-9]+]]
 ; DEBUG-NEXT:    [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG546:![0-9]+]]
 ; DEBUG-NEXT:    store i64 [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8, !dbg [[DBG546]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata ptr undef, metadata [[META539:![0-9]+]], metadata !DIExpression()), !dbg [[DBG547:![0-9]+]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata ptr undef, metadata [[META539:![0-9]+]], metadata !DIExpression()), !dbg [[DBG547:![0-9]+]]
 ; DEBUG-NEXT:    [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12, !dbg [[DBG548:![0-9]+]]
 ; DEBUG-NEXT:    store i64 131072, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4, !dbg [[DBG548]]
 ; DEBUG-NEXT:    [[OBJ_0_SROAVAL:%.*]] = load <4 x ptr>, ptr [[OBJ]], align 16, !dbg [[DBG549:![0-9]+]]
-; DEBUG-NEXT:    call void @llvm.dbg.value(metadata <4 x ptr> [[OBJ_0_SROAVAL]], metadata [[META540:![0-9]+]], metadata !DIExpression()), !dbg [[DBG549]]
+; DEBUG-NEXT:    tail call void @llvm.dbg.value(metadata <4 x ptr> [[OBJ_0_SROAVAL]], metadata [[META540:![0-9]+]], metadata !DIExpression()), !dbg [[DBG549]]
 ; DEBUG-NEXT:    ret <4 x ptr> [[OBJ_0_SROAVAL]], !dbg [[DBG550:![0-9]+]]
 ;
   %val0 = load ptr, ptr %init, align 8

From 1ca65dd74a4370e5788c69e939106b53da13dbf6 Mon Sep 17 00:00:00 2001
From: Yinying Li <107574043+yinying-lisa-li@users.noreply.github.com>
Date: Thu, 29 Feb 2024 14:14:31 -0500
Subject: [PATCH 53/55] [mlir][sparse] Migration to sparse_tensor.print
 (#83377)

Continuous efforts following #83357.
---
 .../SparseTensor/CPU/sparse_loose.mlir        |  23 +-
 .../SparseTensor/CPU/sparse_matmul.mlir       | 237 ++++++++++--------
 .../SparseTensor/CPU/sparse_matmul_slice.mlir |  84 +++----
 .../SparseTensor/CPU/sparse_matrix_ops.mlir   | 114 +++++----
 4 files changed, 259 insertions(+), 199 deletions(-)

diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_loose.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_loose.mlir
index 228d4e5f6f8a1a..e1f062121b12f9 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_loose.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_loose.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -28,7 +28,7 @@
 }>
 
 module {
-  func.func @entry() {
+  func.func @main() {
     %c0 = arith.constant 0 : index
     %f0 = arith.constant 0.0 : f64
     %d = arith.constant dense<[[ 1.0,  2.0,  3.0,  4.0 ],
@@ -39,19 +39,14 @@ module {
     %s = sparse_tensor.convert %d : tensor<5x4xf64> to tensor<5x4xf64, #CSR_hi>
 
     //
-    // CHECK:      ( 0, 4, 4, 8, 8, 9, 9, 13 )
-    // CHECK-NEXT: ( 0, 1, 2, 3, 0, 1, 2, 3, 2, 0, 1, 2, 3, 0, 1, 2, 3 )
-    // CHECK-NEXT: ( 1, 2, 3, 4, 5, 6, 7, 8, 5.5, 9, 10, 11, 12, 13, 14, 15, 16 )
+    // CHECK:   ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 17
+    // CHECK-NEXT: pos[1] : ( 0, 4, 4, 8, 8, 9, 9, 13
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 2, 0, 1, 2, 3, 0, 1, 2, 3
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 5.5, 9, 10, 11, 12, 13, 14, 15, 16
+    // CHECK-NEXT: ----
     //
-    %pos = sparse_tensor.positions %s {level = 1 : index } : tensor<5x4xf64, #CSR_hi> to memref<?xindex>
-    %vecp = vector.transfer_read %pos[%c0], %c0 : memref<?xindex>, vector<8xindex>
-    vector.print %vecp : vector<8xindex>
-    %crd = sparse_tensor.coordinates %s {level = 1 : index } : tensor<5x4xf64, #CSR_hi> to memref<?xindex>
-    %vecc = vector.transfer_read %crd[%c0], %c0 : memref<?xindex>, vector<17xindex>
-    vector.print %vecc : vector<17xindex>
-    %val = sparse_tensor.values %s : tensor<5x4xf64, #CSR_hi> to memref<?xf64>
-    %vecv = vector.transfer_read %val[%c0], %f0 : memref<?xf64>, vector<17xf64>
-    vector.print %vecv : vector<17xf64>
+    sparse_tensor.print %s : tensor<5x4xf64, #CSR_hi>
 
     // Release the resources.
     bufferization.dealloc_tensor %s: tensor<5x4xf64, #CSR_hi>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
index fa0dbac269b926..863e1c62370e32 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -90,7 +90,7 @@ module {
   //
   // Main driver.
   //
-  func.func @entry() {
+  func.func @main() {
     %c0 = arith.constant 0 : index
 
     // Initialize various matrices, dense for stress testing,
@@ -140,33 +140,94 @@ module {
     %b4 = sparse_tensor.convert %sb : tensor<8x4xf64> to tensor<8x4xf64, #DCSR>
 
     //
-    // Sanity check on stored entries before going into the computations.
-    //
-    // CHECK:      32
-    // CHECK-NEXT: 32
-    // CHECK-NEXT: 4
-    // CHECK-NEXT: 4
-    // CHECK-NEXT: 32
-    // CHECK-NEXT: 32
-    // CHECK-NEXT: 8
-    // CHECK-NEXT: 8
-    //
-    %noea1 = sparse_tensor.number_of_entries %a1 : tensor<4x8xf64, #CSR>
-    %noea2 = sparse_tensor.number_of_entries %a2 : tensor<4x8xf64, #DCSR>
-    %noea3 = sparse_tensor.number_of_entries %a3 : tensor<4x8xf64, #CSR>
-    %noea4 = sparse_tensor.number_of_entries %a4 : tensor<4x8xf64, #DCSR>
-    %noeb1 = sparse_tensor.number_of_entries %b1 : tensor<8x4xf64, #CSR>
-    %noeb2 = sparse_tensor.number_of_entries %b2 : tensor<8x4xf64, #DCSR>
-    %noeb3 = sparse_tensor.number_of_entries %b3 : tensor<8x4xf64, #CSR>
-    %noeb4 = sparse_tensor.number_of_entries %b4 : tensor<8x4xf64, #DCSR>
-    vector.print %noea1 : index
-    vector.print %noea2 : index
-    vector.print %noea3 : index
-    vector.print %noea4 : index
-    vector.print %noeb1 : index
-    vector.print %noeb2 : index
-    vector.print %noeb3 : index
-    vector.print %noeb4 : index
+    // Sanity check before going into the computations.
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 32
+    // CHECK-NEXT: pos[1] : ( 0, 8, 16, 24, 32
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+    // CHECK-NEXT: values : ( 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3, 8.3, 1.4, 2.4, 3.4, 4.4, 5.4, 6.4, 7.4, 8.4
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %a1 : tensor<4x8xf64, #CSR>
+
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 32
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 8, 16, 24, 32
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+    // CHECK-NEXT: values : ( 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3, 8.3, 1.4, 2.4, 3.4, 4.4, 5.4, 6.4, 7.4, 8.4
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %a2 : tensor<4x8xf64, #DCSR>
+
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 4
+    // CHECK-NEXT: pos[1] : ( 0, 2, 2, 3, 4
+    // CHECK-NEXT: crd[1] : ( 1, 5, 1, 7
+    // CHECK-NEXT: values : ( 2.1, 6.1, 2.3, 1
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %a3 : tensor<4x8xf64, #CSR>
+
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 4
+    // CHECK-NEXT: pos[0] : ( 0, 3
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 4
+    // CHECK-NEXT: crd[1] : ( 1, 5, 1, 7
+    // CHECK-NEXT: values : ( 2.1, 6.1, 2.3, 1
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %a4 : tensor<4x8xf64, #DCSR>
+
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 32
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16, 20, 24, 28, 32
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+    // CHECK-NEXT: values : ( 10.1, 11.1, 12.1, 13.1, 10.2, 11.2, 12.2, 13.2, 10.3, 11.3, 12.3, 13.3, 10.4, 11.4, 12.4, 13.4, 10.5, 11.5, 12.5, 13.5, 10.6, 11.6, 12.6, 13.6, 10.7, 11.7, 12.7, 13.7, 10.8, 11.8, 12.8, 13.8
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %b1 : tensor<8x4xf64, #CSR>
+
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 32
+    // CHECK-NEXT: pos[0] : ( 0, 8
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16, 20, 24, 28, 32
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+    // CHECK-NEXT: values : ( 10.1, 11.1, 12.1, 13.1, 10.2, 11.2, 12.2, 13.2, 10.3, 11.3, 12.3, 13.3, 10.4, 11.4, 12.4, 13.4, 10.5, 11.5, 12.5, 13.5, 10.6, 11.6, 12.6, 13.6, 10.7, 11.7, 12.7, 13.7, 10.8, 11.8, 12.8, 13.8
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %b2 : tensor<8x4xf64, #DCSR>
+
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 8
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 3, 4, 4, 5, 6, 8
+    // CHECK-NEXT: crd[1] : ( 3, 2, 1, 0, 1, 2, 2, 3
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %b3 : tensor<8x4xf64, #CSR>
+
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 8
+    // CHECK-NEXT: pos[0] : ( 0, 7
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 5, 6, 7
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 3, 4, 5, 6, 8
+    // CHECK-NEXT: crd[1] : ( 3, 2, 1, 0, 1, 2, 2, 3
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %b4 : tensor<8x4xf64, #DCSR>
 
     // Call kernels with dense.
     %0 = call @matmul1(%da, %db, %zero)
@@ -208,24 +269,26 @@ module {
     call @printMemrefF64(%u0) : (tensor<*xf64>) -> ()
 
     //
-    // CHECK:      {{\[}}[388.76,   425.56,   462.36,   499.16],
-    // CHECK-NEXT: [397.12,   434.72,   472.32,   509.92],
-    // CHECK-NEXT: [405.48,   443.88,   482.28,   520.68],
-    // CHECK-NEXT: [413.84,   453.04,   492.24,   531.44]]
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 16
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+    // CHECK-NEXT: values : ( 388.76, 425.56, 462.36, 499.16, 397.12, 434.72, 472.32, 509.92, 405.48, 443.88, 482.28, 520.68, 413.84, 453.04, 492.24, 531.44
+    // CHECK-NEXT: ----
     //
-    %c1 = sparse_tensor.convert %1 : tensor<4x4xf64, #CSR> to tensor<4x4xf64>
-    %c1u = tensor.cast %c1 : tensor<4x4xf64> to tensor<*xf64>
-    call @printMemrefF64(%c1u) : (tensor<*xf64>) -> ()
+    sparse_tensor.print %1 : tensor<4x4xf64, #CSR>
 
     //
-    // CHECK:      {{\[}}[388.76,   425.56,   462.36,   499.16],
-    // CHECK-NEXT: [397.12,   434.72,   472.32,   509.92],
-    // CHECK-NEXT: [405.48,   443.88,   482.28,   520.68],
-    // CHECK-NEXT: [413.84,   453.04,   492.24,   531.44]]
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 16
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+    // CHECK-NEXT: values : ( 388.76, 425.56, 462.36, 499.16, 397.12, 434.72, 472.32, 509.92, 405.48, 443.88, 482.28, 520.68, 413.84, 453.04, 492.24, 531.44
+    // CHECK-NEXT: ----
     //
-    %c2 = sparse_tensor.convert %2 : tensor<4x4xf64, #DCSR> to tensor<4x4xf64>
-    %c2u = tensor.cast %c2 : tensor<4x4xf64> to tensor<*xf64>
-    call @printMemrefF64(%c2u) : (tensor<*xf64>) -> ()
+    sparse_tensor.print %2 : tensor<4x4xf64, #DCSR>
 
     //
     // CHECK:      {{\[}}[86.08,   94.28,   102.48,   110.68],
@@ -237,24 +300,26 @@ module {
     call @printMemrefF64(%u3) : (tensor<*xf64>) -> ()
 
     //
-    // CHECK:      {{\[}}[86.08,   94.28,   102.48,   110.68],
-    // CHECK-NEXT: [0,   0,   0,   0],
-    // CHECK-NEXT: [23.46,   25.76,   28.06,   30.36],
-    // CHECK-NEXT: [10.8,   11.8,   12.8,   13.8]]
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 12
+    // CHECK-NEXT: pos[1] : ( 0, 4, 4, 8, 12
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+    // CHECK-NEXT: values : ( 86.08, 94.28, 102.48, 110.68, 23.46, 25.76, 28.06, 30.36, 10.8, 11.8, 12.8, 13.8
+    // CHECK-NEXT: ----
     //
-    %c4 = sparse_tensor.convert %4 : tensor<4x4xf64, #CSR> to tensor<4x4xf64>
-    %c4u = tensor.cast %c4 : tensor<4x4xf64> to tensor<*xf64>
-    call @printMemrefF64(%c4u) : (tensor<*xf64>) -> ()
+    sparse_tensor.print %4 : tensor<4x4xf64, #CSR>
 
     //
-    // CHECK:      {{\[}}[86.08,   94.28,   102.48,   110.68],
-    // CHECK-NEXT: [0,   0,   0,   0],
-    // CHECK-NEXT: [23.46,   25.76,   28.06,   30.36],
-    // CHECK-NEXT: [10.8,   11.8,   12.8,   13.8]]
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 12
+    // CHECK-NEXT: pos[0] : ( 0, 3
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+    // CHECK-NEXT: values : ( 86.08, 94.28, 102.48, 110.68, 23.46, 25.76, 28.06, 30.36, 10.8, 11.8, 12.8, 13.8
+    // CHECK-NEXT: ----
     //
-    %c5 = sparse_tensor.convert %5 : tensor<4x4xf64, #DCSR> to tensor<4x4xf64>
-    %c5u = tensor.cast %c5 : tensor<4x4xf64> to tensor<*xf64>
-    call @printMemrefF64(%c5u) : (tensor<*xf64>) -> ()
+    sparse_tensor.print %5 : tensor<4x4xf64, #DCSR>
 
     //
     // CHECK:      {{\[}}[0,   30.5,   4.2,   0],
@@ -266,46 +331,26 @@ module {
     call @printMemrefF64(%u6) : (tensor<*xf64>) -> ()
 
     //
-    // CHECK:      {{\[}}[0,   30.5,   4.2,   0],
-    // CHECK-NEXT: [0,   0,   0,   0],
-    // CHECK-NEXT: [0,   0,   4.6,   0],
-    // CHECK-NEXT: [0,   0,   7,   8]]
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 5
+    // CHECK-NEXT: pos[1] : ( 0, 2, 2, 3, 5
+    // CHECK-NEXT: crd[1] : ( 1, 2, 2, 2, 3
+    // CHECK-NEXT: values : ( 30.5, 4.2, 4.6, 7, 8
+    // CHECK-NEXT: ----
     //
-    %c7 = sparse_tensor.convert %7 : tensor<4x4xf64, #CSR> to tensor<4x4xf64>
-    %c7u = tensor.cast %c7 : tensor<4x4xf64> to tensor<*xf64>
-    call @printMemrefF64(%c7u) : (tensor<*xf64>) -> ()
+    sparse_tensor.print %7 : tensor<4x4xf64, #CSR>
 
     //
-    // CHECK:      {{\[}}[0,   30.5,   4.2,   0],
-    // CHECK-NEXT: [0,   0,   0,   0],
-    // CHECK-NEXT: [0,   0,   4.6,   0],
-    // CHECK-NEXT: [0,   0,   7,   8]]
-    //
-    %c8 = sparse_tensor.convert %8 : tensor<4x4xf64, #DCSR> to tensor<4x4xf64>
-    %c8u = tensor.cast %c8 : tensor<4x4xf64> to tensor<*xf64>
-    call @printMemrefF64(%c8u) : (tensor<*xf64>) -> ()
-
-    //
-    // Sanity check on nonzeros.
-    //
-    // CHECK: [30.5,  4.2,  4.6,  7,  8{{.*}}]
-    // CHECK: [30.5,  4.2,  4.6,  7,  8{{.*}}]
-    //
-    %val7 = sparse_tensor.values %7 : tensor<4x4xf64, #CSR> to memref<?xf64>
-    %val8 = sparse_tensor.values %8 : tensor<4x4xf64, #DCSR> to memref<?xf64>
-    call @printMemref1dF64(%val7) : (memref<?xf64>) -> ()
-    call @printMemref1dF64(%val8) : (memref<?xf64>) -> ()
-
-    //
-    // Sanity check on stored entries after the computations.
-    //
-    // CHECK-NEXT: 5
-    // CHECK-NEXT: 5
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 5
+    // CHECK-NEXT: pos[0] : ( 0, 3
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 5
+    // CHECK-NEXT: crd[1] : ( 1, 2, 2, 2, 3
+    // CHECK-NEXT: values : ( 30.5, 4.2, 4.6, 7, 8
+    // CHECK-NEXT: ----
     //
-    %noe7 = sparse_tensor.number_of_entries %7 : tensor<4x4xf64, #CSR>
-    %noe8 = sparse_tensor.number_of_entries %8 : tensor<4x4xf64, #DCSR>
-    vector.print %noe7 : index
-    vector.print %noe8 : index
+    sparse_tensor.print %8 : tensor<4x4xf64, #DCSR>
 
     // Release the resources.
     bufferization.dealloc_tensor %a1 : tensor<4x8xf64, #CSR>
@@ -316,12 +361,6 @@ module {
     bufferization.dealloc_tensor %b2 : tensor<8x4xf64, #DCSR>
     bufferization.dealloc_tensor %b3 : tensor<8x4xf64, #CSR>
     bufferization.dealloc_tensor %b4 : tensor<8x4xf64, #DCSR>
-    bufferization.dealloc_tensor %c1 : tensor<4x4xf64>
-    bufferization.dealloc_tensor %c2 : tensor<4x4xf64>
-    bufferization.dealloc_tensor %c4 : tensor<4x4xf64>
-    bufferization.dealloc_tensor %c5 : tensor<4x4xf64>
-    bufferization.dealloc_tensor %c7 : tensor<4x4xf64>
-    bufferization.dealloc_tensor %c8 : tensor<4x4xf64>
     bufferization.dealloc_tensor %0 : tensor<4x4xf64>
     bufferization.dealloc_tensor %1 : tensor<4x4xf64, #CSR>
     bufferization.dealloc_tensor %2 : tensor<4x4xf64, #DCSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul_slice.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul_slice.mlir
index 96c8a30ade8e42..a7184c044569ca 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul_slice.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul_slice.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -132,7 +132,7 @@ module {
   //
   // Main driver.
   //
-  func.func @entry() {
+  func.func @main() {
     %c_0 = arith.constant 0 : index
     %c_1 = arith.constant 1 : index
     %c_2 = arith.constant 2 : index
@@ -170,14 +170,16 @@ module {
 
     // DCSR test
     //
-    // CHECK:       [0,   30.5,   4.2,   0],
-    // CHECK-NEXT:  [0,   0,   0,   0],
-    // CHECK-NEXT:  [0,   0,   4.6,   0],
-    // CHECK-NEXT:  [0,   0,   7,   8]
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 5
+    // CHECK-NEXT: pos[0] : ( 0, 3
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 5
+    // CHECK-NEXT: crd[1] : ( 1, 2, 2, 2, 3
+    // CHECK-NEXT: values : ( 30.5, 4.2, 4.6, 7, 8
+    // CHECK-NEXT: ----
     //
-    %c2 = sparse_tensor.convert %2 : tensor<4x4xf64, #DCSR> to tensor<4x4xf64>
-    %c2u = tensor.cast %c2 : tensor<4x4xf64> to tensor<*xf64>
-    call @printMemrefF64(%c2u) : (tensor<*xf64>) -> ()
+    sparse_tensor.print %2 : tensor<4x4xf64, #DCSR>
 
     %t1 = sparse_tensor.convert %sa : tensor<8x8xf64> to tensor<8x8xf64, #CSR>
     %a1 = tensor.extract_slice %t1[0, 0][4, 8][1, 1] : tensor<8x8xf64, #CSR> to tensor<4x8xf64, #CSR_SLICE>
@@ -188,63 +190,64 @@ module {
 
     // CSR test
     //
-    // CHECK:       [0,   30.5,   4.2,   0],
-    // CHECK-NEXT:  [0,   0,   0,   0],
-    // CHECK-NEXT:  [0,   0,   4.6,   0],
-    // CHECK-NEXT:  [0,   0,   7,   8]
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 5
+    // CHECK-NEXT: pos[1] : ( 0, 2, 2, 3, 5
+    // CHECK-NEXT: crd[1] : ( 1, 2, 2, 2, 3
+    // CHECK-NEXT: values : ( 30.5, 4.2, 4.6, 7, 8
+    // CHECK-NEXT: ----
     //
-    %c3 = sparse_tensor.convert %3 : tensor<4x4xf64, #CSR> to tensor<4x4xf64>
-    %c3u = tensor.cast %c3 : tensor<4x4xf64> to tensor<*xf64>
-    call @printMemrefF64(%c3u) : (tensor<*xf64>) -> ()
+    sparse_tensor.print %3 : tensor<4x4xf64, #CSR>
+
 
     // slice x slice
     //
-    // CHECK:      [2.3,   0,   0,   0],
-    // CHECK-NEXT: [6.9,   0,   0,   0],
-    // CHECK-NEXT: [0,   0,   0,   0],
-    // CHECK-NEXT: [12.6,   0,   0,   0]]
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 3
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 2, 3
+    // CHECK-NEXT: crd[1] : ( 0, 0, 0
+    // CHECK-NEXT: values : ( 2.3, 6.9, 12.6
+    // CHECK-NEXT: ----
     //
+    sparse_tensor.print %4 : tensor<4x4xf64, #CSR>
+
     %s1 = tensor.extract_slice %tmp[0, 1][4, 4][2, 1] : tensor<8x8xf64, #DCSR> to tensor<4x4xf64, #DCSR_SLICE_1>
     %s2 = tensor.extract_slice %b1[0, 0][4, 4][2, 1] : tensor<8x4xf64, #CSR> to tensor<4x4xf64, #CSR_SLICE_1>
     %4 = call @matmul1(%s2, %s1)
        : (tensor<4x4xf64, #CSR_SLICE_1>,
           tensor<4x4xf64, #DCSR_SLICE_1>) -> tensor<4x4xf64, #CSR>
-    %c4 = sparse_tensor.convert %4 : tensor<4x4xf64, #CSR> to tensor<4x4xf64>
-    %c4u = tensor.cast %c4 : tensor<4x4xf64> to tensor<*xf64>
-    call @printMemrefF64(%c4u) : (tensor<*xf64>) -> ()
 
     // slice coo x slice coo
     //
-    // CHECK:      [2.3,   0,   0,   0],
-    // CHECK-NEXT: [6.9,   0,   0,   0],
-    // CHECK-NEXT: [0,   0,   0,   0],
-    // CHECK-NEXT: [12.6,   0,   0,   0]]
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 3
+    // CHECK-NEXT: pos[0] : ( 0, 3
+    // CHECK-NEXT: crd[0] : ( 0, 0, 1, 0, 3, 0
+    // CHECK-NEXT: values : ( 2.3, 6.9, 12.6
+    // CHECK-NEXT: ----
     //
+    sparse_tensor.print %o_coo : tensor<4x4xf64, #COO>
     %t1_coo = sparse_tensor.convert %sa : tensor<8x8xf64> to tensor<8x8xf64, #COO>
     %b1_coo = sparse_tensor.convert %sb : tensor<8x4xf64> to tensor<8x4xf64, #COO>
     %s2_coo = tensor.extract_slice %b1_coo[0, 0][4, 4][2, 1] : tensor<8x4xf64, #COO> to tensor<4x4xf64, #COO_SLICE_1>
     %s1_coo = tensor.extract_slice %t1_coo[0, 1][4, 4][2, 1] : tensor<8x8xf64, #COO> to tensor<4x4xf64, #COO_SLICE_2>
     %o_coo = call @matmul5(%s2_coo, %s1_coo) : (tensor<4x4xf64, #COO_SLICE_1>, tensor<4x4xf64, #COO_SLICE_2>) -> tensor<4x4xf64, #COO>
 
-    %c4_coo = sparse_tensor.convert %o_coo : tensor<4x4xf64, #COO> to tensor<4x4xf64>
-    %c4u_coo = tensor.cast %c4_coo : tensor<4x4xf64> to tensor<*xf64>
-    call @printMemrefF64(%c4u_coo) : (tensor<*xf64>) -> ()
-
     // slice x slice (same as above, but with dynamic stride information)
     //
-    // CHECK:      [2.3,   0,   0,   0],
-    // CHECK-NEXT: [6.9,   0,   0,   0],
-    // CHECK-NEXT: [0,   0,   0,   0],
-    // CHECK-NEXT: [12.6,   0,   0,   0]]
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 3
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 2, 3
+    // CHECK-NEXT: crd[1] : ( 0, 0, 0
+    // CHECK-NEXT: values : ( 2.3, 6.9, 12.6
+    // CHECK-NEXT: ----
     //
+    sparse_tensor.print %dyn_4 : tensor<4x4xf64, #CSR>
     %s1_dyn = tensor.extract_slice %tmp[%c_0, %c_1][4, 4][%c_2, %c_1] : tensor<8x8xf64, #DCSR> to tensor<4x4xf64, #DCSR_SLICE_dyn>
     %s2_dyn = tensor.extract_slice %b1[%c_0, %c_0][4, 4][%c_2, %c_1] : tensor<8x4xf64, #CSR> to tensor<4x4xf64, #CSR_SLICE_dyn>
     %dyn_4 = call @matmul_dyn(%s2_dyn, %s1_dyn)
        : (tensor<4x4xf64, #CSR_SLICE_dyn>,
           tensor<4x4xf64, #DCSR_SLICE_dyn>) -> tensor<4x4xf64, #CSR>
-    %c4_dyn = sparse_tensor.convert %dyn_4 : tensor<4x4xf64, #CSR> to tensor<4x4xf64>
-    %c4u_dyn = tensor.cast %c4_dyn : tensor<4x4xf64> to tensor<*xf64>
-    call @printMemrefF64(%c4u_dyn) : (tensor<*xf64>) -> ()
 
     // sparse slices should generate the same result as dense slices
     //
@@ -265,11 +268,6 @@ module {
     call @printMemrefF64(%du) : (tensor<*xf64>) -> ()
 
     // Releases resources.
-    bufferization.dealloc_tensor %c2 : tensor<4x4xf64>
-    bufferization.dealloc_tensor %c3 : tensor<4x4xf64>
-    bufferization.dealloc_tensor %c4 : tensor<4x4xf64>
-    bufferization.dealloc_tensor %c4_coo : tensor<4x4xf64>
-    bufferization.dealloc_tensor %c4_dyn : tensor<4x4xf64>
     bufferization.dealloc_tensor %d : tensor<4x4xf64>
     bufferization.dealloc_tensor %b1 : tensor<8x4xf64, #CSR>
     bufferization.dealloc_tensor %t1 : tensor<8x8xf64, #CSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matrix_ops.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matrix_ops.mlir
index 2cecc242034389..2cef46f4cb1546 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matrix_ops.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matrix_ops.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -61,8 +61,6 @@
 }
 
 module {
-  func.func private @printMemrefF64(%ptr : tensor<*xf64>)
-
   // Scales a sparse matrix into a new sparse matrix.
   func.func @matrix_scale(%arga: tensor<?x?xf64, #DCSR>) -> tensor<?x?xf64, #DCSR> {
     %s = arith.constant 2.0 : f64
@@ -129,17 +127,8 @@ module {
     return %0 : tensor<?x?xf64, #DCSR>
   }
 
-  // Dump a sparse matrix.
-  func.func @dump(%arg0: tensor<?x?xf64, #DCSR>) {
-    %dm = sparse_tensor.convert %arg0 : tensor<?x?xf64, #DCSR> to tensor<?x?xf64>
-    %u = tensor.cast %dm : tensor<?x?xf64> to tensor<*xf64>
-    call @printMemrefF64(%u) : (tensor<*xf64>) -> ()
-    bufferization.dealloc_tensor %dm : tensor<?x?xf64>
-    return
-  }
-
   // Driver method to call and verify matrix kernels.
-  func.func @entry() {
+  func.func @main() {
     %c0 = arith.constant 0 : index
     %d1 = arith.constant 1.1 : f64
 
@@ -170,37 +159,76 @@ module {
     //
     // Verify the results.
     //
-    // CHECK:      {{\[}}[1,   2,   0,   0,   0,   0,   0,   0],
-    // CHECK-NEXT: [0,   0,   0,   0,   0,   0,   0,   3],
-    // CHECK-NEXT: [0,   0,   4,   0,   5,   0,   0,   6],
-    // CHECK-NEXT: [7,   0,   8,   9,   0,   0,   0,   0]]
-    // CHECK:      {{\[}}[6,   0,   0,   0,   0,   0,   0,   5],
-    // CHECK-NEXT: [4,   0,   0,   0,   0,   0,   3,   0],
-    // CHECK-NEXT: [0,   2,   0,   0,   0,   0,   0,   1],
-    // CHECK-NEXT: [0,   0,   0,   0,   0,   0,   0,   0]]
-    // CHECK:      {{\[}}[2,   4,   0,   0,   0,   0,   0,   0],
-    // CHECK-NEXT: [0,   0,   0,   0,   0,   0,   0,   6],
-    // CHECK-NEXT: [0,   0,   8,   0,   10,   0,   0,   12],
-    // CHECK-NEXT: [14,   0,   16,   18,   0,   0,   0,   0]]
-    // CHECK:      {{\[}}[2,   4,   0,   0,   0,   0,   0,   0],
-    // CHECK-NEXT: [0,   0,   0,   0,   0,   0,   0,   6],
-    // CHECK-NEXT: [0,   0,   8,   0,   10,   0,   0,   12],
-    // CHECK-NEXT: [14,   0,   16,   18,   0,   0,   0,   0]]
-    // CHECK:      {{\[}}[8,   4,   0,   0,   0,   0,   0,   5],
-    // CHECK-NEXT: [4,   0,   0,   0,   0,   0,   3,   6],
-    // CHECK-NEXT: [0,   2,   8,   0,   10,   0,   0,   13],
-    // CHECK-NEXT: [14,   0,   16,   18,   0,   0,   0,   0]]
-    // CHECK:      {{\[}}[12,   0,   0,   0,   0,   0,   0,   0],
-    // CHECK-NEXT: [0,   0,   0,   0,   0,   0,   0,   0],
-    // CHECK-NEXT: [0,   0,   0,   0,   0,   0,   0,   12],
-    // CHECK-NEXT: [0,   0,   0,   0,   0,   0,   0,   0]]
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 9
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9
+    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 2, 4, 7, 0, 2, 3
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %sm1 : tensor<?x?xf64, #DCSR>
+
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 6
+    // CHECK-NEXT: pos[0] : ( 0, 3
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6
+    // CHECK-NEXT: crd[1] : ( 0, 7, 0, 6, 1, 7
+    // CHECK-NEXT: values : ( 6, 5, 4, 3, 2, 1
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %sm2 : tensor<?x?xf64, #DCSR>
+
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 9
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9
+    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 2, 4, 7, 0, 2, 3
+    // CHECK-NEXT: values : ( 2, 4, 6, 8, 10, 12, 14, 16, 18
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %0 : tensor<?x?xf64, #DCSR>
+
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 9
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9
+    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 2, 4, 7, 0, 2, 3
+    // CHECK-NEXT: values : ( 2, 4, 6, 8, 10, 12, 14, 16, 18
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %1 : tensor<?x?xf64, #DCSR>
+
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 13
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 3, 6, 10, 13
+    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 0, 6, 7, 1, 2, 4, 7, 0, 2, 3
+    // CHECK-NEXT: values : ( 8, 4, 5, 4, 3, 6, 2, 8, 10, 13, 14, 16, 18
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %2 : tensor<?x?xf64, #DCSR>
+
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 2
+    // CHECK-NEXT: pos[0] : ( 0, 2
+    // CHECK-NEXT: crd[0] : ( 0, 2
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2
+    // CHECK-NEXT: crd[1] : ( 0, 7
+    // CHECK-NEXT: values : ( 12, 12
+    // CHECK-NEXT: ----
     //
-    call @dump(%sm1) : (tensor<?x?xf64, #DCSR>) -> ()
-    call @dump(%sm2) : (tensor<?x?xf64, #DCSR>) -> ()
-    call @dump(%0) : (tensor<?x?xf64, #DCSR>) -> ()
-    call @dump(%1) : (tensor<?x?xf64, #DCSR>) -> ()
-    call @dump(%2) : (tensor<?x?xf64, #DCSR>) -> ()
-    call @dump(%3) : (tensor<?x?xf64, #DCSR>) -> ()
+    sparse_tensor.print %3 : tensor<?x?xf64, #DCSR>
 
     // Release the resources.
     bufferization.dealloc_tensor %sm1 : tensor<?x?xf64, #DCSR>

From 45d82f33af9334907dde39cc69921c679299114e Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Thu, 29 Feb 2024 11:02:30 -0800
Subject: [PATCH 54/55] [SLP]Fix miscompilation, cause by incorrect final node
 reordering.

Need to use the regular reordering from the correct node for the final
store/insertelement node to avoid miscommilation.
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp        | 4 ++--
 llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2b7d518c1c1a78..94b7c4952f055e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5158,7 +5158,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
           OrderedEntries.insert(Data.first);
         }
       } else {
-        reorderOrder(Data.first->ReorderIndices, Mask, /*BottomOrder=*/true);
+        reorderOrder(Data.first->ReorderIndices, Mask);
       }
     }
   }
@@ -8102,7 +8102,7 @@ const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                    unsigned Idx) const {
   Value *Op = E->getOperand(Idx).front();
   if (const TreeEntry *TE = getTreeEntry(Op)) {
-    if (find_if(E->UserTreeIndices, [&](const EdgeInfo &EI) {
+    if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
           return EI.EdgeIdx == Idx && EI.UserTE == E;
         }) != TE->UserTreeIndices.end())
       return TE;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll
index c45e1e8b17bf8c..1940e1bc8d18ac 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll
@@ -14,7 +14,7 @@ define void @test(ptr noalias %arg, ptr noalias %arg1, ptr %arg2) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP2]], <4 x float> [[TMP3]], <4 x float> [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
 ; CHECK-NEXT:    store <4 x float> [[TMP6]], ptr [[TMP_I_I4]], align 8
 ; CHECK-NEXT:    ret void
 ;

From 73aab2f69773324ef0250f093bd4d782beee921a Mon Sep 17 00:00:00 2001
From: lntue <35648136+lntue@users.noreply.github.com>
Date: Thu, 29 Feb 2024 14:43:53 -0500
Subject: [PATCH 55/55] [libc] Revert
 https://github.com/llvm/llvm-project/pull/83199 since it broke Fuchsia.
 (#83374)

With some header fix forward for GPU builds.
---
 libc/benchmarks/automemcpy/lib/CMakeLists.txt              | 2 +-
 libc/cmake/modules/LLVMLibCObjectRules.cmake               | 4 ----
 libc/cmake/modules/LLVMLibCTestRules.cmake                 | 4 ----
 libc/cmake/modules/compiler_features/check_fixed_point.cpp | 2 +-
 libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp              | 2 +-
 libc/include/llvm-libc-types/float128.h                    | 2 +-
 libc/src/__support/CPP/limits.h                            | 2 +-
 libc/src/__support/CPP/type_traits/is_fixed_point.h        | 2 +-
 libc/src/__support/HashTable/table.h                       | 2 +-
 libc/src/__support/RPC/rpc_client.h                        | 2 +-
 libc/src/__support/fixed_point/fx_bits.h                   | 2 +-
 libc/src/__support/fixed_point/fx_rep.h                    | 2 +-
 libc/src/__support/fixed_point/sqrt.h                      | 2 +-
 libc/src/__support/macros/properties/types.h               | 4 ++--
 libc/src/stdfix/abshk.h                                    | 2 +-
 libc/src/stdfix/abshr.h                                    | 2 +-
 libc/src/stdfix/absk.h                                     | 2 +-
 libc/src/stdfix/abslk.h                                    | 2 +-
 libc/src/stdfix/abslr.h                                    | 2 +-
 libc/src/stdfix/absr.h                                     | 2 +-
 libc/src/stdfix/roundhk.h                                  | 2 +-
 libc/src/stdfix/roundhr.h                                  | 2 +-
 libc/src/stdfix/roundk.h                                   | 2 +-
 libc/src/stdfix/roundlk.h                                  | 2 +-
 libc/src/stdfix/roundlr.h                                  | 2 +-
 libc/src/stdfix/roundr.h                                   | 2 +-
 libc/src/stdfix/rounduhk.h                                 | 2 +-
 libc/src/stdfix/rounduhr.h                                 | 2 +-
 libc/src/stdfix/rounduk.h                                  | 2 +-
 libc/src/stdfix/roundulk.h                                 | 2 +-
 libc/src/stdfix/roundulr.h                                 | 2 +-
 libc/src/stdfix/roundur.h                                  | 2 +-
 libc/src/stdfix/sqrtuhk.h                                  | 2 +-
 libc/src/stdfix/sqrtuhr.h                                  | 2 +-
 libc/src/stdfix/sqrtuk.h                                   | 2 +-
 libc/src/stdfix/sqrtulr.h                                  | 2 +-
 libc/src/stdfix/sqrtur.h                                   | 2 +-
 libc/src/stdio/printf_core/fixed_converter.h               | 2 +-
 libc/src/stdio/printf_core/parser.h                        | 2 +-
 libc/src/sys/epoll/epoll_pwait.h                           | 4 ++--
 libc/src/sys/epoll/epoll_pwait2.h                          | 6 +++---
 libc/src/sys/epoll/epoll_wait.h                            | 2 +-
 libc/src/sys/epoll/linux/epoll_pwait.cpp                   | 4 ++--
 libc/src/sys/epoll/linux/epoll_pwait2.cpp                  | 6 +++---
 libc/src/sys/epoll/linux/epoll_wait.cpp                    | 4 ++--
 libc/test/UnitTest/CMakeLists.txt                          | 3 +--
 libc/test/UnitTest/LibcTest.cpp                            | 2 +-
 libc/test/include/stdbit_test.cpp                          | 2 +-
 libc/test/include/stdckdint_test.cpp                       | 2 +-
 libc/test/include/sys/queue_test.cpp                       | 2 +-
 libc/test/integration/startup/CMakeLists.txt               | 1 -
 libc/test/integration/startup/gpu/rpc_interface_test.cpp   | 2 +-
 libc/test/integration/startup/gpu/rpc_stream_test.cpp      | 2 +-
 libc/test/integration/startup/gpu/rpc_test.cpp             | 2 +-
 libc/test/src/__support/fixed_point/fx_bits_test.cpp       | 2 +-
 libc/test/src/compiler/stack_chk_guard_test.cpp            | 2 +-
 libc/test/src/math/differential_testing/CMakeLists.txt     | 1 -
 libc/utils/LibcTableGenUtil/CMakeLists.txt                 | 2 +-
 libc/utils/gpu/loader/Loader.h                             | 2 +-
 59 files changed, 63 insertions(+), 74 deletions(-)

diff --git a/libc/benchmarks/automemcpy/lib/CMakeLists.txt b/libc/benchmarks/automemcpy/lib/CMakeLists.txt
index bb6a5631f2c3f6..e66b9045b6074a 100644
--- a/libc/benchmarks/automemcpy/lib/CMakeLists.txt
+++ b/libc/benchmarks/automemcpy/lib/CMakeLists.txt
@@ -19,7 +19,7 @@ add_custom_command(
 add_library(automemcpy_implementations "${Implementations}")
 target_link_libraries(automemcpy_implementations PUBLIC LLVMSupport libc-memory-benchmark)
 target_include_directories(automemcpy_implementations PRIVATE 
-                           ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include ${LIBC_AUTOMEMCPY_INCLUDE_DIR})
+                           ${LIBC_SOURCE_DIR} ${LIBC_AUTOMEMCPY_INCLUDE_DIR})
 target_compile_options(automemcpy_implementations PRIVATE ${LIBC_COMPILE_OPTIONS_NATIVE} "SHELL:-mllvm -combiner-global-alias-analysis" -fno-builtin)
 llvm_update_compile_flags(automemcpy_implementations)
 
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
index 5469799f023983..0649e9f7a76709 100644
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -59,7 +59,6 @@ function(create_object_library fq_target_name)
   )
   target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
-  target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
   target_compile_options(${fq_target_name} PRIVATE ${compile_options})
 
   # The NVPTX target is installed as LLVM-IR but the internal testing toolchain
@@ -74,7 +73,6 @@ function(create_object_library fq_target_name)
     )
     target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
     target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR})
-    target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
     target_compile_options(${internal_target_name} PRIVATE ${compile_options}
                            -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE})
   endif()
@@ -281,7 +279,6 @@ function(create_entrypoint_object fq_target_name)
   target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
   target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR})
-  target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
   add_dependencies(${internal_target_name} ${full_deps_list})
   target_link_libraries(${internal_target_name} ${full_deps_list})
 
@@ -303,7 +300,6 @@ function(create_entrypoint_object fq_target_name)
   target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLIBC_COPT_PUBLIC_PACKAGING)
   target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
-  target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
   add_dependencies(${fq_target_name} ${full_deps_list})
   target_link_libraries(${fq_target_name} ${full_deps_list})
 
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 5981d427b71f8d..836e15d34741b2 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -184,7 +184,6 @@ function(create_libc_unittest fq_target_name)
   )
   target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
-  target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
   target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
 
   if(NOT LIBC_UNITTEST_CXX_STANDARD)
@@ -318,7 +317,6 @@ function(add_libc_fuzzer target_name)
   )
   target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
-  target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
 
   target_link_libraries(${fq_target_name} PRIVATE
     ${link_object_files}
@@ -459,7 +457,6 @@ function(add_integration_test test_name)
       PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
-  target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
 
   _get_hermetic_test_compile_options(compile_options "${INTEGRATION_TEST_COMPILE_OPTIONS}")
   target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
@@ -635,7 +632,6 @@ function(add_libc_hermetic_test test_name)
   _get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}")
   target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
-  target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
   _get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}")
   target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
 
diff --git a/libc/cmake/modules/compiler_features/check_fixed_point.cpp b/libc/cmake/modules/compiler_features/check_fixed_point.cpp
index 9199340fe652ea..a5192697d43f77 100644
--- a/libc/cmake/modules/compiler_features/check_fixed_point.cpp
+++ b/libc/cmake/modules/compiler_features/check_fixed_point.cpp
@@ -1,4 +1,4 @@
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 #ifndef LIBC_COMPILER_HAS_FIXED_POINT
 #error unsupported
diff --git a/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp b/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp
index c385c3a8f3e44a..b4a8621891203d 100644
--- a/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp
+++ b/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp
@@ -11,7 +11,7 @@
 //===----------------------------------------------------------------------===//
 #include "src/stdio/snprintf.h"
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 #include "src/__support/fixed_point/fx_bits.h"
 #include "src/__support/fixed_point/fx_rep.h"
 
diff --git a/libc/include/llvm-libc-types/float128.h b/libc/include/llvm-libc-types/float128.h
index e2dc18c040d99e..0b290c676ecc02 100644
--- a/libc/include/llvm-libc-types/float128.h
+++ b/libc/include/llvm-libc-types/float128.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_TYPES_FLOAT128_H
 #define LLVM_LIBC_TYPES_FLOAT128_H
 
-#include "llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG
+#include "../llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG
 
 // Currently, C23 `_Float128` type is only defined as a built-in type in GCC 7
 // or later, and only for C.  For C++, or for clang, `__float128` is defined
diff --git a/libc/src/__support/CPP/limits.h b/libc/src/__support/CPP/limits.h
index 6440e8cb358fa7..1ffde5f9556f87 100644
--- a/libc/src/__support/CPP/limits.h
+++ b/libc/src/__support/CPP/limits.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_CPP_LIMITS_H
 #define LLVM_LIBC_SRC___SUPPORT_CPP_LIMITS_H
 
-#include "llvm-libc-macros/limits-macros.h" // CHAR_BIT
+#include "include/llvm-libc-macros/limits-macros.h" // CHAR_BIT
 #include "src/__support/CPP/type_traits/is_integral.h"
 #include "src/__support/CPP/type_traits/is_signed.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
diff --git a/libc/src/__support/CPP/type_traits/is_fixed_point.h b/libc/src/__support/CPP/type_traits/is_fixed_point.h
index 09dba8b7ecbcd4..025268bc2979d3 100644
--- a/libc/src/__support/CPP/type_traits/is_fixed_point.h
+++ b/libc/src/__support/CPP/type_traits/is_fixed_point.h
@@ -12,7 +12,7 @@
 #include "src/__support/CPP/type_traits/remove_cv.h"
 #include "src/__support/macros/attributes.h"
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE::cpp {
 
diff --git a/libc/src/__support/HashTable/table.h b/libc/src/__support/HashTable/table.h
index 07fcd42c97f3e7..8f6c5887c189e8 100644
--- a/libc/src/__support/HashTable/table.h
+++ b/libc/src/__support/HashTable/table.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_HASHTABLE_TABLE_H
 #define LLVM_LIBC_SRC___SUPPORT_HASHTABLE_TABLE_H
 
-#include "llvm-libc-types/ENTRY.h"
+#include "include/llvm-libc-types/ENTRY.h"
 #include "src/__support/CPP/bit.h" // bit_ceil
 #include "src/__support/CPP/new.h"
 #include "src/__support/HashTable/bitmask.h"
diff --git a/libc/src/__support/RPC/rpc_client.h b/libc/src/__support/RPC/rpc_client.h
index 571d7cce2a8039..6e1827dbfeea92 100644
--- a/libc/src/__support/RPC/rpc_client.h
+++ b/libc/src/__support/RPC/rpc_client.h
@@ -11,7 +11,7 @@
 
 #include "rpc.h"
 
-#include "llvm-libc-types/rpc_opcodes_t.h"
+#include "include/llvm-libc-types/rpc_opcodes_t.h"
 
 namespace LIBC_NAMESPACE {
 namespace rpc {
diff --git a/libc/src/__support/fixed_point/fx_bits.h b/libc/src/__support/fixed_point/fx_bits.h
index 6fdbc6f6ece63f..41da45c01e4e19 100644
--- a/libc/src/__support/fixed_point/fx_bits.h
+++ b/libc/src/__support/fixed_point/fx_bits.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_FIXED_POINT_FX_BITS_H
 #define LLVM_LIBC_SRC___SUPPORT_FIXED_POINT_FX_BITS_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/macros/attributes.h"   // LIBC_INLINE
diff --git a/libc/src/__support/fixed_point/fx_rep.h b/libc/src/__support/fixed_point/fx_rep.h
index e1fee62f335eb9..f8593a93684cbc 100644
--- a/libc/src/__support/fixed_point/fx_rep.h
+++ b/libc/src/__support/fixed_point/fx_rep.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_FIXED_POINT_FX_REP_H
 #define LLVM_LIBC_SRC___SUPPORT_FIXED_POINT_FX_REP_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE, LIBC_INLINE_VAR
 
diff --git a/libc/src/__support/fixed_point/sqrt.h b/libc/src/__support/fixed_point/sqrt.h
index 236ebb2857030b..d8df294b18a1a8 100644
--- a/libc/src/__support/fixed_point/sqrt.h
+++ b/libc/src/__support/fixed_point/sqrt.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H
 #define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/macros/attributes.h"   // LIBC_INLINE
diff --git a/libc/src/__support/macros/properties/types.h b/libc/src/__support/macros/properties/types.h
index e812a9dfcfd8ab..0a3c834663cc92 100644
--- a/libc/src/__support/macros/properties/types.h
+++ b/libc/src/__support/macros/properties/types.h
@@ -10,8 +10,8 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H
 #define LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H
 
-#include "llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG
-#include "llvm-libc-types/float128.h"      // float128
+#include "include/llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG
+#include "include/llvm-libc-types/float128.h"      // float128
 #include "src/__support/macros/properties/architectures.h"
 #include "src/__support/macros/properties/compiler.h"
 #include "src/__support/macros/properties/cpu_features.h"
diff --git a/libc/src/stdfix/abshk.h b/libc/src/stdfix/abshk.h
index 80dc73053dfb45..13c9300caab883 100644
--- a/libc/src/stdfix/abshk.h
+++ b/libc/src/stdfix/abshk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ABSHK_H
 #define LLVM_LIBC_SRC_STDFIX_ABSHK_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/abshr.h b/libc/src/stdfix/abshr.h
index 035f9a6de222e8..5acd0cfc4a60db 100644
--- a/libc/src/stdfix/abshr.h
+++ b/libc/src/stdfix/abshr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ABSHR_H
 #define LLVM_LIBC_SRC_STDFIX_ABSHR_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/absk.h b/libc/src/stdfix/absk.h
index 426415de28e6ae..73dfcac0ac8e7f 100644
--- a/libc/src/stdfix/absk.h
+++ b/libc/src/stdfix/absk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ABSK_H
 #define LLVM_LIBC_SRC_STDFIX_ABSK_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/abslk.h b/libc/src/stdfix/abslk.h
index 21e33f856bfc65..7de116fa227932 100644
--- a/libc/src/stdfix/abslk.h
+++ b/libc/src/stdfix/abslk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ABSLK_H
 #define LLVM_LIBC_SRC_STDFIX_ABSLK_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/abslr.h b/libc/src/stdfix/abslr.h
index ebca35e58aa510..bf5b585bbbb669 100644
--- a/libc/src/stdfix/abslr.h
+++ b/libc/src/stdfix/abslr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ABSLR_H
 #define LLVM_LIBC_SRC_STDFIX_ABSLR_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/absr.h b/libc/src/stdfix/absr.h
index 2744fcb5a7eccc..b5ead7ce14e2a0 100644
--- a/libc/src/stdfix/absr.h
+++ b/libc/src/stdfix/absr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ABSR_H
 #define LLVM_LIBC_SRC_STDFIX_ABSR_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundhk.h b/libc/src/stdfix/roundhk.h
index 06de5cc05cdbe4..9a5c874cc030db 100644
--- a/libc/src/stdfix/roundhk.h
+++ b/libc/src/stdfix/roundhk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDHK_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDHK_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundhr.h b/libc/src/stdfix/roundhr.h
index 6729bf5b139973..ba5a67945d6c3b 100644
--- a/libc/src/stdfix/roundhr.h
+++ b/libc/src/stdfix/roundhr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDHR_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDHR_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundk.h b/libc/src/stdfix/roundk.h
index 02fb9a8c9b1a86..e9fa6d8f9c3b8c 100644
--- a/libc/src/stdfix/roundk.h
+++ b/libc/src/stdfix/roundk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDK_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDK_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundlk.h b/libc/src/stdfix/roundlk.h
index 28be9c00549498..5fa0e90e855a64 100644
--- a/libc/src/stdfix/roundlk.h
+++ b/libc/src/stdfix/roundlk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDLK_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDLK_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundlr.h b/libc/src/stdfix/roundlr.h
index be97a35a64204d..c015292e8f3f28 100644
--- a/libc/src/stdfix/roundlr.h
+++ b/libc/src/stdfix/roundlr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDLR_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDLR_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundr.h b/libc/src/stdfix/roundr.h
index 15523f8b6c9a38..b5b1375c882e03 100644
--- a/libc/src/stdfix/roundr.h
+++ b/libc/src/stdfix/roundr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDR_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDR_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/rounduhk.h b/libc/src/stdfix/rounduhk.h
index d1c4a4416d7636..85ebf2903ec7e9 100644
--- a/libc/src/stdfix/rounduhk.h
+++ b/libc/src/stdfix/rounduhk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUHK_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDUHK_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/rounduhr.h b/libc/src/stdfix/rounduhr.h
index 6cecb733dd3b18..1be0aab1f5a79d 100644
--- a/libc/src/stdfix/rounduhr.h
+++ b/libc/src/stdfix/rounduhr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUHR_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDUHR_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/rounduk.h b/libc/src/stdfix/rounduk.h
index 4511d69525c5d5..8dae89586c4901 100644
--- a/libc/src/stdfix/rounduk.h
+++ b/libc/src/stdfix/rounduk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUK_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDUK_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundulk.h b/libc/src/stdfix/roundulk.h
index 8bd90beeb830c5..81dfd1dceb6001 100644
--- a/libc/src/stdfix/roundulk.h
+++ b/libc/src/stdfix/roundulk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDULK_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDULK_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundulr.h b/libc/src/stdfix/roundulr.h
index 65e5c27b1c8531..002fc94907c613 100644
--- a/libc/src/stdfix/roundulr.h
+++ b/libc/src/stdfix/roundulr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDULR_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDULR_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundur.h b/libc/src/stdfix/roundur.h
index 110e578da79319..72de44b1e0c4e5 100644
--- a/libc/src/stdfix/roundur.h
+++ b/libc/src/stdfix/roundur.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUR_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDUR_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/sqrtuhk.h b/libc/src/stdfix/sqrtuhk.h
index b57340003fa03c..80000a0079696d 100644
--- a/libc/src/stdfix/sqrtuhk.h
+++ b/libc/src/stdfix/sqrtuhk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUHK_H
 #define LLVM_LIBC_SRC_STDFIX_SQRTUHK_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/sqrtuhr.h b/libc/src/stdfix/sqrtuhr.h
index 6b629a29de3c88..fd95f0924e8d48 100644
--- a/libc/src/stdfix/sqrtuhr.h
+++ b/libc/src/stdfix/sqrtuhr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUHR_H
 #define LLVM_LIBC_SRC_STDFIX_SQRTUHR_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/sqrtuk.h b/libc/src/stdfix/sqrtuk.h
index 6bd7a2608716cd..04d0adadde9ad2 100644
--- a/libc/src/stdfix/sqrtuk.h
+++ b/libc/src/stdfix/sqrtuk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUK_H
 #define LLVM_LIBC_SRC_STDFIX_SQRTUK_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/sqrtulr.h b/libc/src/stdfix/sqrtulr.h
index d1982a6b1c0518..284adaaf35bf59 100644
--- a/libc/src/stdfix/sqrtulr.h
+++ b/libc/src/stdfix/sqrtulr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_SQRTULR_H
 #define LLVM_LIBC_SRC_STDFIX_SQRTULR_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/sqrtur.h b/libc/src/stdfix/sqrtur.h
index 13f7d1e5e466ec..df9dfe5a0bf39e 100644
--- a/libc/src/stdfix/sqrtur.h
+++ b/libc/src/stdfix/sqrtur.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUR_H
 #define LLVM_LIBC_SRC_STDFIX_SQRTUR_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdio/printf_core/fixed_converter.h b/libc/src/stdio/printf_core/fixed_converter.h
index c89971e20686e4..de69c603be6b63 100644
--- a/libc/src/stdio/printf_core/fixed_converter.h
+++ b/libc/src/stdio/printf_core/fixed_converter.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FIXED_CONVERTER_H
 #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FIXED_CONVERTER_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/fixed_point/fx_bits.h"
 #include "src/__support/fixed_point/fx_rep.h"
diff --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h
index 13fdbf243a22e8..0876116a0bac86 100644
--- a/libc/src/stdio/printf_core/parser.h
+++ b/libc/src/stdio/printf_core/parser.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PARSER_H
 #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PARSER_H
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 #include "src/__support/CPP/optional.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/str_to_integer.h"
diff --git a/libc/src/sys/epoll/epoll_pwait.h b/libc/src/sys/epoll/epoll_pwait.h
index 16105850d6942e..9dcb55533009f9 100644
--- a/libc/src/sys/epoll/epoll_pwait.h
+++ b/libc/src/sys/epoll/epoll_pwait.h
@@ -10,8 +10,8 @@
 #define LLVM_LIBC_SRC_SYS_EPOLL_EPOLL_PWAIT_H
 
 // TODO: Use this include once the include headers are also using quotes.
-// #include "llvm-libc-types/sigset_t.h"
-// #include "llvm-libc-types/struct_epoll_event.h"
+// #include "include/llvm-libc-types/sigset_t.h"
+// #include "include/llvm-libc-types/struct_epoll_event.h"
 
 #include <sys/epoll.h>
 
diff --git a/libc/src/sys/epoll/epoll_pwait2.h b/libc/src/sys/epoll/epoll_pwait2.h
index f7b28d4fbc51d9..622ede6a0f9f9a 100644
--- a/libc/src/sys/epoll/epoll_pwait2.h
+++ b/libc/src/sys/epoll/epoll_pwait2.h
@@ -10,9 +10,9 @@
 #define LLVM_LIBC_SRC_SYS_EPOLL_EPOLL_PWAIT2_H
 
 // TODO: Use this include once the include headers are also using quotes.
-// #include "llvm-libc-types/sigset_t.h"
-// #include "llvm-libc-types/struct_epoll_event.h"
-// #include "llvm-libc-types/struct_timespec.h"
+// #include "include/llvm-libc-types/sigset_t.h"
+// #include "include/llvm-libc-types/struct_epoll_event.h"
+// #include "include/llvm-libc-types/struct_timespec.h"
 
 #include <sys/epoll.h>
 
diff --git a/libc/src/sys/epoll/epoll_wait.h b/libc/src/sys/epoll/epoll_wait.h
index 0dc487bba5bdf4..d51c9100846ce0 100644
--- a/libc/src/sys/epoll/epoll_wait.h
+++ b/libc/src/sys/epoll/epoll_wait.h
@@ -10,7 +10,7 @@
 #define LLVM_LIBC_SRC_SYS_EPOLL_EPOLL_WAIT_H
 
 // TODO: Use this include once the include headers are also using quotes.
-// #include "llvm-libc-types/struct_epoll_event.h"
+// #include "include/llvm-libc-types/struct_epoll_event.h"
 
 #include <sys/epoll.h>
 
diff --git a/libc/src/sys/epoll/linux/epoll_pwait.cpp b/libc/src/sys/epoll/linux/epoll_pwait.cpp
index e0c13a7a79602f..ee1b4e66e98444 100644
--- a/libc/src/sys/epoll/linux/epoll_pwait.cpp
+++ b/libc/src/sys/epoll/linux/epoll_pwait.cpp
@@ -15,8 +15,8 @@
 #include <sys/syscall.h> // For syscall numbers.
 
 // TODO: Use this include once the include headers are also using quotes.
-// #include "llvm-libc-types/sigset_t.h"
-// #include "llvm-libc-types/struct_epoll_event.h"
+// #include "include/llvm-libc-types/sigset_t.h"
+// #include "include/llvm-libc-types/struct_epoll_event.h"
 
 #include <sys/epoll.h>
 
diff --git a/libc/src/sys/epoll/linux/epoll_pwait2.cpp b/libc/src/sys/epoll/linux/epoll_pwait2.cpp
index a44b0c2a9f70f0..671dede2a1058d 100644
--- a/libc/src/sys/epoll/linux/epoll_pwait2.cpp
+++ b/libc/src/sys/epoll/linux/epoll_pwait2.cpp
@@ -15,9 +15,9 @@
 #include <sys/syscall.h> // For syscall numbers.
 
 // TODO: Use this include once the include headers are also using quotes.
-// #include "llvm-libc-types/sigset_t.h"
-// #include "llvm-libc-types/struct_epoll_event.h"
-// #include "llvm-libc-types/struct_timespec.h"
+// #include "include/llvm-libc-types/sigset_t.h"
+// #include "include/llvm-libc-types/struct_epoll_event.h"
+// #include "include/llvm-libc-types/struct_timespec.h"
 
 #include <sys/epoll.h>
 
diff --git a/libc/src/sys/epoll/linux/epoll_wait.cpp b/libc/src/sys/epoll/linux/epoll_wait.cpp
index b643e2dd720cb6..0c43edf7645454 100644
--- a/libc/src/sys/epoll/linux/epoll_wait.cpp
+++ b/libc/src/sys/epoll/linux/epoll_wait.cpp
@@ -14,8 +14,8 @@
 #include <sys/syscall.h> // For syscall numbers.
 
 // TODO: Use this include once the include headers are also using quotes.
-// #include "llvm-libc-types/sigset_t.h"
-// #include "llvm-libc-types/struct_epoll_event.h"
+// #include "include/llvm-libc-types/sigset_t.h"
+// #include "include/llvm-libc-types/struct_epoll_event.h"
 
 #include <sys/epoll.h>
 
diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt
index 466494f038f4e3..4668f0061975f8 100644
--- a/libc/test/UnitTest/CMakeLists.txt
+++ b/libc/test/UnitTest/CMakeLists.txt
@@ -26,8 +26,7 @@ function(add_unittest_framework_library name)
       ${TEST_LIB_SRCS}
       ${TEST_LIB_HDRS}
     )
-    target_include_directories(${lib} PUBLIC 
-                               ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include)
+    target_include_directories(${lib} PUBLIC ${LIBC_SOURCE_DIR})
     list(APPEND compile_options -fno-exceptions -fno-rtti)
     if(TARGET libc.src.time.clock)
       target_compile_definitions(${lib} PRIVATE TARGET_SUPPORTS_CLOCK)
diff --git a/libc/test/UnitTest/LibcTest.cpp b/libc/test/UnitTest/LibcTest.cpp
index babd44f9b20630..7b0e4fca83683b 100644
--- a/libc/test/UnitTest/LibcTest.cpp
+++ b/libc/test/UnitTest/LibcTest.cpp
@@ -8,7 +8,7 @@
 
 #include "LibcTest.h"
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 #include "src/__support/CPP/string.h"
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/UInt128.h"
diff --git a/libc/test/include/stdbit_test.cpp b/libc/test/include/stdbit_test.cpp
index 84a4cde18b9f40..acb79ca0f3ff11 100644
--- a/libc/test/include/stdbit_test.cpp
+++ b/libc/test/include/stdbit_test.cpp
@@ -88,7 +88,7 @@ bool stdc_has_single_bit_ul(unsigned long) noexcept { return false; }
 bool stdc_has_single_bit_ull(unsigned long long) noexcept { return false; }
 }
 
-#include "llvm-libc-macros/stdbit-macros.h"
+#include "include/llvm-libc-macros/stdbit-macros.h"
 
 TEST(LlvmLibcStdbitTest, TypeGenericMacroLeadingZeros) {
   EXPECT_EQ(stdc_leading_zeros(static_cast<unsigned char>(0U)), 0xAAU);
diff --git a/libc/test/include/stdckdint_test.cpp b/libc/test/include/stdckdint_test.cpp
index 5ac8c95f4ef26f..1180a6de9efe2e 100644
--- a/libc/test/include/stdckdint_test.cpp
+++ b/libc/test/include/stdckdint_test.cpp
@@ -8,7 +8,7 @@
 
 #include "test/UnitTest/Test.h"
 
-#include "llvm-libc-macros/stdckdint-macros.h"
+#include "include/llvm-libc-macros/stdckdint-macros.h"
 
 TEST(LlvmLibcStdCkdIntTest, Add) {
   int result;
diff --git a/libc/test/include/sys/queue_test.cpp b/libc/test/include/sys/queue_test.cpp
index 48c0e811c61542..c10e48d627caa2 100644
--- a/libc/test/include/sys/queue_test.cpp
+++ b/libc/test/include/sys/queue_test.cpp
@@ -10,7 +10,7 @@
 #include "src/__support/char_vector.h"
 #include "test/UnitTest/Test.h"
 
-#include "llvm-libc-macros/sys-queue-macros.h"
+#include "include/llvm-libc-macros/sys-queue-macros.h"
 
 using LIBC_NAMESPACE::CharVector;
 using LIBC_NAMESPACE::cpp::string;
diff --git a/libc/test/integration/startup/CMakeLists.txt b/libc/test/integration/startup/CMakeLists.txt
index 08c0d978602b80..fb5d6bc787cc26 100644
--- a/libc/test/integration/startup/CMakeLists.txt
+++ b/libc/test/integration/startup/CMakeLists.txt
@@ -31,7 +31,6 @@ function(add_startup_test target_name)
     ${fq_target_name}
     PRIVATE
       ${LIBC_SOURCE_DIR}
-      ${LIBC_SOURCE_DIR}/include
       ${LIBC_BUILD_DIR}
       ${LIBC_BUILD_DIR}/include
   )
diff --git a/libc/test/integration/startup/gpu/rpc_interface_test.cpp b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
index 7bbd7085fc2f45..674e2cc1ed7499 100644
--- a/libc/test/integration/startup/gpu/rpc_interface_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm-libc-types/test_rpc_opcodes_t.h"
+#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "test/IntegrationTest/test.h"
diff --git a/libc/test/integration/startup/gpu/rpc_stream_test.cpp b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
index 9401f822904d04..09a4ae67256e3a 100644
--- a/libc/test/integration/startup/gpu/rpc_stream_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm-libc-types/test_rpc_opcodes_t.h"
+#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "src/__support/integer_to_string.h"
diff --git a/libc/test/integration/startup/gpu/rpc_test.cpp b/libc/test/integration/startup/gpu/rpc_test.cpp
index bb36b6cedb63cb..4032d890c53ec8 100644
--- a/libc/test/integration/startup/gpu/rpc_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm-libc-types/test_rpc_opcodes_t.h"
+#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "test/IntegrationTest/test.h"
diff --git a/libc/test/src/__support/fixed_point/fx_bits_test.cpp b/libc/test/src/__support/fixed_point/fx_bits_test.cpp
index 5670687273d5ba..58627816eb8d97 100644
--- a/libc/test/src/__support/fixed_point/fx_bits_test.cpp
+++ b/libc/test/src/__support/fixed_point/fx_bits_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm-libc-macros/stdfix-macros.h"
+#include "include/llvm-libc-macros/stdfix-macros.h"
 
 #include "src/__support/fixed_point/fx_bits.h"
 #include "src/__support/integer_literals.h"
diff --git a/libc/test/src/compiler/stack_chk_guard_test.cpp b/libc/test/src/compiler/stack_chk_guard_test.cpp
index 427e20c2ac5046..6b71e155fa3e4d 100644
--- a/libc/test/src/compiler/stack_chk_guard_test.cpp
+++ b/libc/test/src/compiler/stack_chk_guard_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm-libc-macros/signal-macros.h"
+#include "include/llvm-libc-macros/signal-macros.h"
 #include "src/__support/macros/sanitizer.h"
 #include "src/compiler/__stack_chk_fail.h"
 #include "src/string/memset.h"
diff --git a/libc/test/src/math/differential_testing/CMakeLists.txt b/libc/test/src/math/differential_testing/CMakeLists.txt
index 36bfdca1a442d6..878f81f1d573c8 100644
--- a/libc/test/src/math/differential_testing/CMakeLists.txt
+++ b/libc/test/src/math/differential_testing/CMakeLists.txt
@@ -47,7 +47,6 @@ function(add_diff_binary target_name)
     ${fq_target_name}
     PRIVATE
       ${LIBC_SOURCE_DIR}
-      ${LIBC_SOURCE_DIR}/include
   )
   if(DIFF_COMPILE_OPTIONS)
     target_compile_options(
diff --git a/libc/utils/LibcTableGenUtil/CMakeLists.txt b/libc/utils/LibcTableGenUtil/CMakeLists.txt
index 60208ed790d574..dca6a7bb830655 100644
--- a/libc/utils/LibcTableGenUtil/CMakeLists.txt
+++ b/libc/utils/LibcTableGenUtil/CMakeLists.txt
@@ -5,5 +5,5 @@ add_llvm_library(
   DISABLE_LLVM_LINK_LLVM_DYLIB
   LINK_COMPONENTS Support TableGen
 )
-target_include_directories(LibcTableGenUtil PUBLIC ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include)
+target_include_directories(LibcTableGenUtil PUBLIC ${LIBC_SOURCE_DIR})
 target_include_directories(LibcTableGenUtil PRIVATE ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR})
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index d74d65e8993829..e2aabb08c11dac 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -11,7 +11,7 @@
 
 #include "utils/gpu/server/llvmlibc_rpc_server.h"
 
-#include "llvm-libc-types/test_rpc_opcodes_t.h"
+#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
 
 #include <cstddef>
 #include <cstdint>