From 8a3a0821568ac4a1e5c88e101ee8eea9d503d59d Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Mon, 6 Nov 2023 11:51:43 +0000 Subject: [PATCH 01/40] [Flang][HLFIR] : Use the attributes from the ultimate symbol (#71195) This change is required for hlfir.declares of host-associated symbols in the OpenMP regions. Added A FIXME to correctly use the symbol attributes for VOLATILE and ASYNCHRONOUS. --- flang/lib/Lower/ConvertVariable.cpp | 8 +++++-- .../Lower/OpenMP/parallel-private-clause.f90 | 24 +++++++++---------- .../test/Lower/OpenMP/private-commonblock.f90 | 2 +- flang/test/Lower/OpenMP/target.f90 | 2 +- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 57fb9fc432de2f..e8137886d2cf54 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -1645,9 +1645,13 @@ void Fortran::lower::genDeclareSymbol( !sym.detailsIf()) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); const mlir::Location loc = genLocation(converter, sym); + // FIXME: Using the ultimate symbol for translating symbol attributes will + // lead to situations where the VOLATILE/ASYNCHRONOUS attributes are not + // propagated to the hlfir.declare (these attributes can be added when + // using module variables). fir::FortranVariableFlagsAttr attributes = - Fortran::lower::translateSymbolAttributes(builder.getContext(), sym, - extraFlags); + Fortran::lower::translateSymbolAttributes( + builder.getContext(), sym.GetUltimate(), extraFlags); auto name = converter.mangleName(sym); hlfir::EntityWithAttributes declare = hlfir::genDeclare(loc, builder, exv, name, attributes); diff --git a/flang/test/Lower/OpenMP/parallel-private-clause.f90 b/flang/test/Lower/OpenMP/parallel-private-clause.f90 index 5fc77f206250ef..3e46d315f8cc47 100644 --- a/flang/test/Lower/OpenMP/parallel-private-clause.f90 +++ b/flang/test/Lower/OpenMP/parallel-private-clause.f90 @@ -142,13 +142,13 @@ subroutine private_clause_derived_type() !FIRDialect: omp.parallel { !FIRDialect-DAG: %[[X_PVT:.*]] = fir.alloca !fir.box> {bindc_name = "x", pinned, uniq_name = "{{.*}}Ex"} -!FIRDialect-DAG: %[[X_PVT_DECL:.*]]:2 = hlfir.declare %[[X_PVT]] {uniq_name = "{{.*}}Ex"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +!FIRDialect-DAG: %[[X_PVT_DECL:.*]]:2 = hlfir.declare %[[X_PVT]] {fortran_attrs = #fir.var_attrs, uniq_name = "{{.*}}Ex"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) !FIRDialect-DAG: %[[X2_PVT:.*]] = fir.alloca !fir.box>> {bindc_name = "x2", pinned, uniq_name = "{{.*}}Ex2"} -!FIRDialect-DAG: %[[X2_PVT_DECL:.*]]:2 = hlfir.declare %[[X2_PVT]] {uniq_name = "{{.*}}Ex2"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +!FIRDialect-DAG: %[[X2_PVT_DECL:.*]]:2 = hlfir.declare %[[X2_PVT]] {fortran_attrs = #fir.var_attrs, uniq_name = "{{.*}}Ex2"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) !FIRDialect-DAG: %[[X3_PVT:.*]] = fir.alloca !fir.box> {bindc_name = "x3", pinned, uniq_name = "{{.*}}Ex3"} -!FIRDialect-DAG: %[[X3_PVT_DECL:.*]]:2 = hlfir.declare %[[X3_PVT]] {uniq_name = "{{.*}}Ex3"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +!FIRDialect-DAG: %[[X3_PVT_DECL:.*]]:2 = hlfir.declare %[[X3_PVT]] {fortran_attrs = #fir.var_attrs, uniq_name = "{{.*}}Ex3"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) !FIRDialect-DAG: %[[X4_PVT:.*]] = fir.alloca !fir.box>> {bindc_name = "x4", pinned, uniq_name = "{{.*}}Ex4"} -!FIRDialect-DAG: %[[X4_PVT_DECL:.*]]:2 = hlfir.declare %[[X4_PVT]] {uniq_name = "{{.*}}Ex4"} : (!fir.ref>>>) -> (!fir.ref>>>, 
!fir.ref>>>) +!FIRDialect-DAG: %[[X4_PVT_DECL:.*]]:2 = hlfir.declare %[[X4_PVT]] {fortran_attrs = #fir.var_attrs, uniq_name = "{{.*}}Ex4"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) !FIRDialect-DAG: %[[TMP58:.*]] = fir.load %[[X4_DECL]]#1 : !fir.ref>>> !FIRDialect-DAG: %[[TMP97:.*]] = fir.load %[[X4_DECL]]#1 : !fir.ref>>> @@ -190,7 +190,7 @@ subroutine private_clause_allocatable() !FIRDialect-DAG: fir.store %{{.*}} to %[[X5_PVT]] : !fir.ref>> !FIRDialect-DAG: } -!FIRDialect-DAG: %[[X5_PVT_DECL:.*]]:2 = hlfir.declare %[[X5_PVT]] {uniq_name = "_QFprivate_clause_real_call_allocatableEx5"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +!FIRDialect-DAG: %[[X5_PVT_DECL:.*]]:2 = hlfir.declare %[[X5_PVT]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFprivate_clause_real_call_allocatableEx5"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) !FIRDialect-DAG: fir.call @_QFprivate_clause_real_call_allocatablePhelper_private_clause_real_call_allocatable(%[[X5_PVT_DECL]]#0) fastmath : (!fir.ref>>) -> () !FIRDialect-DAG: %{{.*}} = fir.load %[[X5_PVT_DECL]]#1 : !fir.ref>> @@ -222,7 +222,7 @@ subroutine helper_private_clause_real_call_allocatable(x6) !FIRDialect: %[[P_DECL:.*]]:2 = hlfir.declare %[[P]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFincrement_list_itemsEp"} : (!fir.ref>>}>>>>) -> (!fir.ref>>}>>>>, !fir.ref>>}>>>>) !FIRDialect: omp.parallel { !FIRDialect: %[[P_PVT:.*]] = fir.alloca !fir.box>>}>>> {bindc_name = "p", pinned, uniq_name = "_QFincrement_list_itemsEp"} -!FIRDialect: %[[P_PVT_DECL:.*]]:2 = hlfir.declare %[[P_PVT]] {uniq_name = "_QFincrement_list_itemsEp"} : (!fir.ref>>}>>>>) -> (!fir.ref>>}>>>>, !fir.ref>>}>>>>) +!FIRDialect: %[[P_PVT_DECL:.*]]:2 = hlfir.declare %[[P_PVT]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFincrement_list_itemsEp"} : (!fir.ref>>}>>>>) -> (!fir.ref>>}>>>>, !fir.ref>>}>>>>) !FIRDialect: omp.single { !FIRDialect: omp.terminator @@ -262,9 +262,9 @@ end subroutine increment_list_items !FIRDialect-DAG: %[[Z2_DECL:.*]]:2 = hlfir.declare %[[Z2]](%12) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFparallel_pointerEz2"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) !FIRDialect: omp.parallel { !FIRDialect-DAG: %[[Y1_PVT:.*]] = fir.alloca !fir.box> {bindc_name = "y1", pinned, uniq_name = "_QFparallel_pointerEy1"} -!FIRDialect-DAG: %[[Y1_PVT_DECL:.*]]:2 = hlfir.declare %[[Y1_PVT]] {uniq_name = "_QFparallel_pointerEy1"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +!FIRDialect-DAG: %[[Y1_PVT_DECL:.*]]:2 = hlfir.declare %[[Y1_PVT]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFparallel_pointerEy1"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) !FIRDialect-DAG: %[[Y2_PVT:.*]] = fir.alloca !fir.box>> {bindc_name = "y2", pinned, uniq_name = "_QFparallel_pointerEy2"} -!FIRDialect-DAG: %[[Y2_PVT_DECL:.*]]:2 = hlfir.declare %[[Y2_PVT]] {uniq_name = "_QFparallel_pointerEy2"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +!FIRDialect-DAG: %[[Y2_PVT_DECL:.*]]:2 = hlfir.declare %[[Y2_PVT]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFparallel_pointerEy2"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) !FIRDialect-DAG: %[[PP18:.*]] = fir.embox %[[Z1_DECL]]#1 : (!fir.ref) -> !fir.box> !FIRDialect: fir.store %[[PP18]] to %[[Y1_PVT_DECL]]#1 : !fir.ref>> !FIRDialect-DAG: %[[PP19:.*]] = fir.shape %c10 : (index) -> !fir.shape<1> @@ -298,7 +298,7 @@ subroutine simple_loop_1 ! FIRDialect: [[R:%.*]] = fir.alloca !fir.box> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"} ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> ! 
FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> - ! FIRDialect: %[[R_DECL:.*]]:2 = hlfir.declare [[R]] {uniq_name = "_QFsimple_loop_1Er"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) + ! FIRDialect: %[[R_DECL:.*]]:2 = hlfir.declare [[R]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsimple_loop_1Er"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! FIRDialect: %[[WS_LB:.*]] = arith.constant 1 : i32 ! FIRDialect: %[[WS_UB:.*]] = arith.constant 9 : i32 @@ -336,7 +336,7 @@ subroutine simple_loop_2 ! FIRDialect: [[R:%.*]] = fir.alloca !fir.box> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"} ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> - ! FIRDialect: %[[R_DECL:.*]]:2 = hlfir.declare [[R]] {uniq_name = "{{.*}}Er"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) + ! FIRDialect: %[[R_DECL:.*]]:2 = hlfir.declare [[R]] {fortran_attrs = #fir.var_attrs, uniq_name = "{{.*}}Er"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! FIRDialect: %[[WS_LB:.*]] = arith.constant 1 : i32 ! FIRDialect: %[[WS_UB:.*]] = arith.constant 9 : i32 @@ -373,7 +373,7 @@ subroutine simple_loop_3 ! FIRDialect: [[R:%.*]] = fir.alloca !fir.box> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"} ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> - ! FIRDialect: [[R_DECL:%.*]]:2 = hlfir.declare [[R]] {uniq_name = "{{.*}}Er"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) + ! FIRDialect: [[R_DECL:%.*]]:2 = hlfir.declare [[R]] {fortran_attrs = #fir.var_attrs, uniq_name = "{{.*}}Er"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! FIRDialect: %[[WS_LB:.*]] = arith.constant 1 : i32 ! FIRDialect: %[[WS_UB:.*]] = arith.constant 9 : i32 @@ -405,7 +405,7 @@ subroutine simd_loop_1 ! FIRDialect: [[R:%.*]] = fir.alloca !fir.box> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"} ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> - ! FIRDialect: [[R_DECL:%.*]]:2 = hlfir.declare [[R]] {uniq_name = "{{.*}}r"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) + ! FIRDialect: [[R_DECL:%.*]]:2 = hlfir.declare [[R]] {fortran_attrs = #fir.var_attrs, uniq_name = "{{.*}}r"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! FIRDialect: %[[LB:.*]] = arith.constant 1 : i32 ! 
FIRDialect: %[[UB:.*]] = arith.constant 9 : i32 diff --git a/flang/test/Lower/OpenMP/private-commonblock.f90 b/flang/test/Lower/OpenMP/private-commonblock.f90 index 615ddb21129c91..ee580594f7c3fc 100644 --- a/flang/test/Lower/OpenMP/private-commonblock.f90 +++ b/flang/test/Lower/OpenMP/private-commonblock.f90 @@ -100,7 +100,7 @@ subroutine private_clause_commonblock() !CHECK: fir.call @_QPsub4(%[[C_REF]], %[[A_DECL]]#1) fastmath : (!fir.ref>, !fir.ref) -> () !CHECK: omp.parallel { !CHECK: %[[C_PVT_REF:.*]] = fir.alloca !fir.box>> {bindc_name = "c", pinned, uniq_name = "_QFprivate_clause_commonblock_pointerEc"} -!CHECK: %[[C_PVT_DECL:.*]]:2 = hlfir.declare %[[C_PVT_REF]] {uniq_name = "_QFprivate_clause_commonblock_pointerEc"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +!CHECK: %[[C_PVT_DECL:.*]]:2 = hlfir.declare %[[C_PVT_REF]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFprivate_clause_commonblock_pointerEc"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) !CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFprivate_clause_commonblock_pointerEa"} !CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFprivate_clause_commonblock_pointerEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[C_PVT_BOX:.*]] = fir.load %[[C_PVT_DECL]]#0 : !fir.ref>>> diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 754e13d1a36b50..64e740a818fe03 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -254,7 +254,7 @@ subroutine omp_target_device_addr !CHECK: omp.target_data map_entries(%[[MAP]] : {{.*}}) use_device_addr(%[[VAL_0_DECL]]#1 : !fir.ref>>) { !$omp target data map(tofrom: a) use_device_addr(a) !CHECK: ^bb0(%[[VAL_1:.*]]: !fir.ref>>): - !CHECK: %[[VAL_1_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_target_device_addrEa"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) + !CHECK: %[[VAL_1_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFomp_target_device_addrEa"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) !CHECK: %[[C10:.*]] = arith.constant 10 : i32 !CHECK: %[[A_BOX:.*]] = fir.load %[[VAL_1_DECL]]#0 : !fir.ref>> !CHECK: %[[A_ADDR:.*]] = fir.box_addr %[[A_BOX]] : (!fir.box>) -> !fir.ptr From ff67e851826ccd535efce86db7c1a57e3f95d790 Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Mon, 6 Nov 2023 06:58:01 -0500 Subject: [PATCH 02/40] llvm: [NFC] Robustify testcase (#71120) Robustify the check-not to not trigger on llvm-as's pathname in its initial error message. --- llvm/test/Verifier/alias.ll | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/test/Verifier/alias.ll b/llvm/test/Verifier/alias.ll index 8d755ec6a91c6f..42758cf2c7f37b 100644 --- a/llvm/test/Verifier/alias.ll +++ b/llvm/test/Verifier/alias.ll @@ -1,5 +1,7 @@ -; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=alias --implicit-check-not=Alias +; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s +; CHECK: : assembly parsed, but does not verify as correct! +; CHECK-NOT: {{(^A| a)lias(es)? }} declare void @f() @fa = alias void (), ptr @f From 96dadc9fc83dddf450e42ea5e9c3fd2616761830 Mon Sep 17 00:00:00 2001 From: "Oleksandr \"Alex\" Zinenko" Date: Mon, 6 Nov 2023 13:14:56 +0100 Subject: [PATCH 03/40] [mlir] support scalable vectors in python bindings (#71050) The scalable dimension functionality was added to the vector type after the bindings for it were defined, without the bindings being ever updated. Fix that. 
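A minimal usage sketch of the new keyword arguments (a hypothetical standalone
script: it assumes the standard mlir.ir Python package is importable and simply
mirrors the tests this patch adds to mlir/test/python/ir/builtin_types.py):

    from mlir.ir import Context, F32Type, Location, VectorType

    with Context(), Location.unknown():
        f32 = F32Type.get()
        # Flag every dimension as scalable or fixed...
        v1 = VectorType.get([2, 3], f32, scalable=[False, True])
        # ...or list just the indices of the scalable dimensions.
        v2 = VectorType.get([2, 3], f32, scalable_dims=[1])
        assert v1 == v2  # both print as vector<2x[3]xf32>
        assert v1.scalable
        assert v1.scalable_dims == [False, True]

The two kwargs are mutually exclusive; scalable must have one entry per
dimension of the shape, and scalable_dims must hold in-bounds dimension
indices, otherwise a ValueError is raised.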
--- mlir/include/mlir-c/BuiltinTypes.h | 26 +++++++++++ mlir/lib/Bindings/Python/IRTypes.cpp | 69 ++++++++++++++++++++++------ mlir/lib/CAPI/IR/BuiltinTypes.cpp | 25 ++++++++++ mlir/test/CAPI/ir.c | 34 ++++++++++---- mlir/test/python/ir/builtin_types.py | 43 ++++++++++++++++- 5 files changed, 172 insertions(+), 25 deletions(-) diff --git a/mlir/include/mlir-c/BuiltinTypes.h b/mlir/include/mlir-c/BuiltinTypes.h index a6d8e10efbde92..1fd5691f41eec3 100644 --- a/mlir/include/mlir-c/BuiltinTypes.h +++ b/mlir/include/mlir-c/BuiltinTypes.h @@ -271,6 +271,32 @@ MLIR_CAPI_EXPORTED MlirType mlirVectorTypeGetChecked(MlirLocation loc, const int64_t *shape, MlirType elementType); +/// Creates a scalable vector type with the shape identified by its rank and +/// dimensions. A subset of dimensions may be marked as scalable via the +/// corresponding flag list, which is expected to have as many entries as the +/// rank of the vector. The vector is created in the same context as the element +/// type. +MLIR_CAPI_EXPORTED MlirType mlirVectorTypeGetScalable(intptr_t rank, + const int64_t *shape, + const bool *scalable, + MlirType elementType); + +/// Same as "mlirVectorTypeGetScalable" but returns a nullptr wrapping MlirType +/// on illegal arguments, emitting appropriate diagnostics. +MLIR_CAPI_EXPORTED +MlirType mlirVectorTypeGetScalableChecked(MlirLocation loc, intptr_t rank, + const int64_t *shape, + const bool *scalable, + MlirType elementType); + +/// Checks whether the given vector type is scalable, i.e., has at least one +/// scalable dimension. +MLIR_CAPI_EXPORTED bool mlirVectorTypeIsScalable(MlirType type); + +/// Checks whether the "dim"-th dimension of the given vector is scalable. +MLIR_CAPI_EXPORTED bool mlirVectorTypeIsDimScalable(MlirType type, + intptr_t dim); + //===----------------------------------------------------------------------===// // Ranked / Unranked Tensor type. 
//===----------------------------------------------------------------------===// diff --git a/mlir/lib/Bindings/Python/IRTypes.cpp b/mlir/lib/Bindings/Python/IRTypes.cpp index a7ccfbea542f5c..483db673f989e6 100644 --- a/mlir/lib/Bindings/Python/IRTypes.cpp +++ b/mlir/lib/Bindings/Python/IRTypes.cpp @@ -462,19 +462,62 @@ class PyVectorType : public PyConcreteType { using PyConcreteType::PyConcreteType; static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](std::vector shape, PyType &elementType, - DefaultingPyLocation loc) { - PyMlirContext::ErrorCapture errors(loc->getContext()); - MlirType t = mlirVectorTypeGetChecked(loc, shape.size(), shape.data(), - elementType); - if (mlirTypeIsNull(t)) - throw MLIRError("Invalid type", errors.take()); - return PyVectorType(elementType.getContext(), t); - }, - py::arg("shape"), py::arg("elementType"), py::arg("loc") = py::none(), - "Create a vector type"); + c.def_static("get", &PyVectorType::get, py::arg("shape"), + py::arg("elementType"), py::kw_only(), + py::arg("scalable") = py::none(), + py::arg("scalable_dims") = py::none(), + py::arg("loc") = py::none(), "Create a vector type") + .def_property_readonly( + "scalable", + [](MlirType self) { return mlirVectorTypeIsScalable(self); }) + .def_property_readonly("scalable_dims", [](MlirType self) { + std::vector scalableDims; + size_t rank = static_cast(mlirShapedTypeGetRank(self)); + scalableDims.reserve(rank); + for (size_t i = 0; i < rank; ++i) + scalableDims.push_back(mlirVectorTypeIsDimScalable(self, i)); + return scalableDims; + }); + } + +private: + static PyVectorType get(std::vector shape, PyType &elementType, + std::optional scalable, + std::optional> scalableDims, + DefaultingPyLocation loc) { + if (scalable && scalableDims) { + throw py::value_error("'scalable' and 'scalable_dims' kwargs " + "are mutually exclusive."); + } + + PyMlirContext::ErrorCapture errors(loc->getContext()); + MlirType type; + if (scalable) { + if (scalable->size() != shape.size()) + throw py::value_error("Expected len(scalable) == len(shape)."); + + SmallVector scalableDimFlags = llvm::to_vector(llvm::map_range( + *scalable, [](const py::handle &h) { return h.cast(); })); + type = mlirVectorTypeGetScalableChecked(loc, shape.size(), shape.data(), + scalableDimFlags.data(), + elementType); + } else if (scalableDims) { + SmallVector scalableDimFlags(shape.size(), false); + for (int64_t dim : *scalableDims) { + if (static_cast(dim) >= scalableDimFlags.size() || dim < 0) + throw py::value_error("Scalable dimension index out of bounds."); + scalableDimFlags[dim] = true; + } + type = mlirVectorTypeGetScalableChecked(loc, shape.size(), shape.data(), + scalableDimFlags.data(), + elementType); + } else { + type = mlirVectorTypeGetChecked(loc, shape.size(), shape.data(), + elementType); + } + if (mlirTypeIsNull(type)) + throw MLIRError("Invalid type", errors.take()); + return PyVectorType(elementType.getContext(), type); } }; diff --git a/mlir/lib/CAPI/IR/BuiltinTypes.cpp b/mlir/lib/CAPI/IR/BuiltinTypes.cpp index 50266b4b523323..6e645188dac861 100644 --- a/mlir/lib/CAPI/IR/BuiltinTypes.cpp +++ b/mlir/lib/CAPI/IR/BuiltinTypes.cpp @@ -281,6 +281,31 @@ MlirType mlirVectorTypeGetChecked(MlirLocation loc, intptr_t rank, unwrap(elementType))); } +MlirType mlirVectorTypeGetScalable(intptr_t rank, const int64_t *shape, + const bool *scalable, MlirType elementType) { + return wrap(VectorType::get( + llvm::ArrayRef(shape, static_cast(rank)), unwrap(elementType), + llvm::ArrayRef(scalable, static_cast(rank)))); +} + +MlirType 
mlirVectorTypeGetScalableChecked(MlirLocation loc, intptr_t rank, + const int64_t *shape, + const bool *scalable, + MlirType elementType) { + return wrap(VectorType::getChecked( + unwrap(loc), llvm::ArrayRef(shape, static_cast(rank)), + unwrap(elementType), + llvm::ArrayRef(scalable, static_cast(rank)))); +} + +bool mlirVectorTypeIsScalable(MlirType type) { + return unwrap(type).cast().isScalable(); +} + +bool mlirVectorTypeIsDimScalable(MlirType type, intptr_t dim) { + return unwrap(type).cast().getScalableDims()[dim]; +} + //===----------------------------------------------------------------------===// // Ranked / Unranked tensor type. //===----------------------------------------------------------------------===// diff --git a/mlir/test/CAPI/ir.c b/mlir/test/CAPI/ir.c index 8d5dcbf62e85e2..315458a08b613e 100644 --- a/mlir/test/CAPI/ir.c +++ b/mlir/test/CAPI/ir.c @@ -746,13 +746,27 @@ static int printBuiltinTypes(MlirContext ctx) { fprintf(stderr, "\n"); // CHECK: vector<2x3xf32> + // Scalable vector type. + bool scalable[] = {false, true}; + MlirType scalableVector = mlirVectorTypeGetScalable( + sizeof(shape) / sizeof(int64_t), shape, scalable, f32); + if (!mlirTypeIsAVector(scalableVector)) + return 16; + if (!mlirVectorTypeIsScalable(scalableVector) || + mlirVectorTypeIsDimScalable(scalableVector, 0) || + !mlirVectorTypeIsDimScalable(scalableVector, 1)) + return 17; + mlirTypeDump(scalableVector); + fprintf(stderr, "\n"); + // CHECK: vector<2x[3]xf32> + // Ranked tensor type. MlirType rankedTensor = mlirRankedTensorTypeGet( sizeof(shape) / sizeof(int64_t), shape, f32, mlirAttributeGetNull()); if (!mlirTypeIsATensor(rankedTensor) || !mlirTypeIsARankedTensor(rankedTensor) || !mlirAttributeIsNull(mlirRankedTensorTypeGetEncoding(rankedTensor))) - return 16; + return 18; mlirTypeDump(rankedTensor); fprintf(stderr, "\n"); // CHECK: tensor<2x3xf32> @@ -762,7 +776,7 @@ static int printBuiltinTypes(MlirContext ctx) { if (!mlirTypeIsATensor(unrankedTensor) || !mlirTypeIsAUnrankedTensor(unrankedTensor) || mlirShapedTypeHasRank(unrankedTensor)) - return 17; + return 19; mlirTypeDump(unrankedTensor); fprintf(stderr, "\n"); // CHECK: tensor<*xf32> @@ -773,7 +787,7 @@ static int printBuiltinTypes(MlirContext ctx) { f32, sizeof(shape) / sizeof(int64_t), shape, memSpace2); if (!mlirTypeIsAMemRef(memRef) || !mlirAttributeEqual(mlirMemRefTypeGetMemorySpace(memRef), memSpace2)) - return 18; + return 20; mlirTypeDump(memRef); fprintf(stderr, "\n"); // CHECK: memref<2x3xf32, 2> @@ -785,7 +799,7 @@ static int printBuiltinTypes(MlirContext ctx) { mlirTypeIsAMemRef(unrankedMemRef) || !mlirAttributeEqual(mlirUnrankedMemrefGetMemorySpace(unrankedMemRef), memSpace4)) - return 19; + return 21; mlirTypeDump(unrankedMemRef); fprintf(stderr, "\n"); // CHECK: memref<*xf32, 4> @@ -796,7 +810,7 @@ static int printBuiltinTypes(MlirContext ctx) { if (!mlirTypeIsATuple(tuple) || mlirTupleTypeGetNumTypes(tuple) != 2 || !mlirTypeEqual(mlirTupleTypeGetType(tuple, 0), unrankedMemRef) || !mlirTypeEqual(mlirTupleTypeGetType(tuple, 1), f32)) - return 20; + return 22; mlirTypeDump(tuple); fprintf(stderr, "\n"); // CHECK: tuple, f32> @@ -808,16 +822,16 @@ static int printBuiltinTypes(MlirContext ctx) { mlirIntegerTypeGet(ctx, 64)}; MlirType funcType = mlirFunctionTypeGet(ctx, 2, funcInputs, 3, funcResults); if (mlirFunctionTypeGetNumInputs(funcType) != 2) - return 21; + return 23; if (mlirFunctionTypeGetNumResults(funcType) != 3) - return 22; + return 24; if (!mlirTypeEqual(funcInputs[0], mlirFunctionTypeGetInput(funcType, 0)) || 
!mlirTypeEqual(funcInputs[1], mlirFunctionTypeGetInput(funcType, 1))) - return 23; + return 25; if (!mlirTypeEqual(funcResults[0], mlirFunctionTypeGetResult(funcType, 0)) || !mlirTypeEqual(funcResults[1], mlirFunctionTypeGetResult(funcType, 1)) || !mlirTypeEqual(funcResults[2], mlirFunctionTypeGetResult(funcType, 2))) - return 24; + return 26; mlirTypeDump(funcType); fprintf(stderr, "\n"); // CHECK: (index, i1) -> (i16, i32, i64) @@ -832,7 +846,7 @@ static int printBuiltinTypes(MlirContext ctx) { !mlirStringRefEqual(mlirOpaqueTypeGetDialectNamespace(opaque), namespace) || !mlirStringRefEqual(mlirOpaqueTypeGetData(opaque), data)) - return 25; + return 27; mlirTypeDump(opaque); fprintf(stderr, "\n"); // CHECK: !dialect.type diff --git a/mlir/test/python/ir/builtin_types.py b/mlir/test/python/ir/builtin_types.py index 672418b5383ae4..4c891a2ca2ab9a 100644 --- a/mlir/test/python/ir/builtin_types.py +++ b/mlir/test/python/ir/builtin_types.py @@ -300,7 +300,7 @@ def testVectorType(): none = NoneType.get() try: - vector_invalid = VectorType.get(shape, none) + VectorType.get(shape, none) except MLIRError as e: # CHECK: Invalid type: # CHECK: error: unknown: vector elements must be int/index/float type but got 'none' @@ -308,6 +308,46 @@ def testVectorType(): else: print("Exception not produced") + scalable_1 = VectorType.get(shape, f32, scalable=[False, True]) + scalable_2 = VectorType.get([2, 3, 4], f32, scalable=[True, False, True]) + assert scalable_1.scalable + assert scalable_2.scalable + assert scalable_1.scalable_dims == [False, True] + assert scalable_2.scalable_dims == [True, False, True] + # CHECK: scalable 1: vector<2x[3]xf32> + print("scalable 1: ", scalable_1) + # CHECK: scalable 2: vector<[2]x3x[4]xf32> + print("scalable 2: ", scalable_2) + + scalable_3 = VectorType.get(shape, f32, scalable_dims=[1]) + scalable_4 = VectorType.get([2, 3, 4], f32, scalable_dims=[0, 2]) + assert scalable_3 == scalable_1 + assert scalable_4 == scalable_2 + + try: + VectorType.get(shape, f32, scalable=[False, True, True]) + except ValueError as e: + # CHECK: Expected len(scalable) == len(shape). + print(e) + else: + print("Exception not produced") + + try: + VectorType.get(shape, f32, scalable=[False, True], scalable_dims=[1]) + except ValueError as e: + # CHECK: kwargs are mutually exclusive. + print(e) + else: + print("Exception not produced") + + try: + VectorType.get(shape, f32, scalable=[False, True], scalable_dims=[42]) + except ValueError as e: + # CHECK: Scalable dimension index out of bounds. + print(e) + else: + print("Exception not produced") + # CHECK-LABEL: TEST: testRankedTensorType @run @@ -337,7 +377,6 @@ def testRankedTensorType(): assert RankedTensorType.get(shape, f32).encoding is None - # CHECK-LABEL: TEST: testUnrankedTensorType @run def testUnrankedTensorType(): From 8a0535fbbe21cb69c8bf53d7ebc76bbe51f3152e Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Mon, 6 Nov 2023 13:18:02 +0100 Subject: [PATCH 04/40] [bazel] Fix the bazel build for mlir:SPIRVTarget target. 
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 413c516185a071..e93239251d4326 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -6533,6 +6533,7 @@ cc_library( deps = [ ":GPUDialect", ":SPIRVDialect", + ":Support", ], ) From d53ff36740b96dbcad36f6da9ea6f65c45021c04 Mon Sep 17 00:00:00 2001 From: Oleg Shyshkov Date: Mon, 6 Nov 2023 13:20:32 +0100 Subject: [PATCH 05/40] [mlir][bazel] Fix build for d9dadfda85aad7942f6b61783402b9f2d5173782 --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index e93239251d4326..41a3ef047834bd 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -5945,6 +5945,7 @@ cc_library( ":NVVMToLLVMIRTranslation", ":TargetLLVM", ":ToLLVMIRTranslation", + "//llvm:NVPTXCodeGen", "//llvm:Support", ], ) From ba13978f4269f4f35f7d1c32bb529f19bf1d8f46 Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Mon, 6 Nov 2023 12:30:17 +0000 Subject: [PATCH 06/40] [mlir] fix broken python test --- mlir/test/python/ir/builtin_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/test/python/ir/builtin_types.py b/mlir/test/python/ir/builtin_types.py index 4c891a2ca2ab9a..d4fed86b4f135e 100644 --- a/mlir/test/python/ir/builtin_types.py +++ b/mlir/test/python/ir/builtin_types.py @@ -341,7 +341,7 @@ def testVectorType(): print("Exception not produced") try: - VectorType.get(shape, f32, scalable=[False, True], scalable_dims=[42]) + VectorType.get(shape, f32, scalable_dims=[42]) except ValueError as e: # CHECK: Scalable dimension index out of bounds. print(e) From b142501e9267e308b6b6e5a051f327aa1e0ee3b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Degioanni?= <30992420+Moxinilian@users.noreply.github.com> Date: Mon, 6 Nov 2023 13:53:16 +0100 Subject: [PATCH 07/40] [mlir][memref] Fix segfault in SROA (#71063) Fixes #70902. The out of bounds check in the SROA implementation for MemRef was not actually testing anything because it only operated on a store op which does not trigger the logic by itself. It is now checked for real and the underlying bug is fixed. I checked the LLVM implementation just in case but this should not happen as out-of-bound checks happen in GEP's verifier there. --- .../Dialect/MemRef/IR/MemRefMemorySlot.cpp | 30 ++++++++++++------- mlir/test/Dialect/MemRef/sroa.mlir | 25 ++++++++++++++-- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefMemorySlot.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefMemorySlot.cpp index 93ec2bcdf58fa4..be301c191d5139 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefMemorySlot.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefMemorySlot.cpp @@ -187,14 +187,22 @@ DeletionKind memref::LoadOp::removeBlockingUses( return DeletionKind::Delete; } -/// Returns the index of a memref in attribute form, given its indices. +/// Returns the index of a memref in attribute form, given its indices. Returns +/// a null pointer if whether the indices form a valid index for the provided +/// MemRefType cannot be computed. The indices must come from a valid memref +/// StoreOp or LoadOp. 
static Attribute getAttributeIndexFromIndexOperands(MLIRContext *ctx, - ValueRange indices) { + ValueRange indices, + MemRefType memrefType) { SmallVector index; - for (Value coord : indices) { + for (auto [coord, dimSize] : llvm::zip(indices, memrefType.getShape())) { IntegerAttr coordAttr; if (!matchPattern(coord, m_Constant(&coordAttr))) return {}; + // MemRefType shape dimensions are always positive (checked by verifier). + std::optional coordInt = coordAttr.getValue().tryZExtValue(); + if (!coordInt || coordInt.value() >= static_cast(dimSize)) + return {}; index.push_back(coordAttr); } return ArrayAttr::get(ctx, index); @@ -205,8 +213,8 @@ bool memref::LoadOp::canRewire(const DestructurableMemorySlot &slot, SmallVectorImpl &mustBeSafelyUsed) { if (slot.ptr != getMemRef()) return false; - Attribute index = - getAttributeIndexFromIndexOperands(getContext(), getIndices()); + Attribute index = getAttributeIndexFromIndexOperands( + getContext(), getIndices(), getMemRefType()); if (!index) return false; usedIndices.insert(index); @@ -216,8 +224,8 @@ bool memref::LoadOp::canRewire(const DestructurableMemorySlot &slot, DeletionKind memref::LoadOp::rewire(const DestructurableMemorySlot &slot, DenseMap &subslots, RewriterBase &rewriter) { - Attribute index = - getAttributeIndexFromIndexOperands(getContext(), getIndices()); + Attribute index = getAttributeIndexFromIndexOperands( + getContext(), getIndices(), getMemRefType()); const MemorySlot &memorySlot = subslots.at(index); rewriter.updateRootInPlace(*this, [&]() { setMemRef(memorySlot.ptr); @@ -258,8 +266,8 @@ bool memref::StoreOp::canRewire(const DestructurableMemorySlot &slot, SmallVectorImpl &mustBeSafelyUsed) { if (slot.ptr != getMemRef() || getValue() == slot.ptr) return false; - Attribute index = - getAttributeIndexFromIndexOperands(getContext(), getIndices()); + Attribute index = getAttributeIndexFromIndexOperands( + getContext(), getIndices(), getMemRefType()); if (!index || !slot.elementPtrs.contains(index)) return false; usedIndices.insert(index); @@ -269,8 +277,8 @@ bool memref::StoreOp::canRewire(const DestructurableMemorySlot &slot, DeletionKind memref::StoreOp::rewire(const DestructurableMemorySlot &slot, DenseMap &subslots, RewriterBase &rewriter) { - Attribute index = - getAttributeIndexFromIndexOperands(getContext(), getIndices()); + Attribute index = getAttributeIndexFromIndexOperands( + getContext(), getIndices(), getMemRefType()); const MemorySlot &memorySlot = subslots.at(index); rewriter.updateRootInPlace(*this, [&]() { setMemRef(memorySlot.ptr); diff --git a/mlir/test/Dialect/MemRef/sroa.mlir b/mlir/test/Dialect/MemRef/sroa.mlir index d78053d8ea777e..40ab9b3483b833 100644 --- a/mlir/test/Dialect/MemRef/sroa.mlir +++ b/mlir/test/Dialect/MemRef/sroa.mlir @@ -132,9 +132,9 @@ func.func @no_dynamic_shape(%arg0: i32, %arg1: i32) -> i32 { // ----- -// CHECK-LABEL: func.func @no_out_of_bounds +// CHECK-LABEL: func.func @no_out_of_bound_write // CHECK-SAME: (%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32) -func.func @no_out_of_bounds(%arg0: i32, %arg1: i32) -> i32 { +func.func @no_out_of_bound_write(%arg0: i32, %arg1: i32) -> i32 { // CHECK: %[[C0:.*]] = arith.constant 0 : index %c0 = arith.constant 0 : index // CHECK: %[[C100:.*]] = arith.constant 100 : index @@ -152,3 +152,24 @@ func.func @no_out_of_bounds(%arg0: i32, %arg1: i32) -> i32 { // CHECK: return %[[RES]] : i32 return %res : i32 } + +// ----- + +// CHECK-LABEL: func.func @no_out_of_bound_load +// CHECK-SAME: (%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32) +func.func 
@no_out_of_bound_load(%arg0: i32, %arg1: i32) -> i32 { + // CHECK: %[[C0:.*]] = arith.constant 0 : index + %c0 = arith.constant 0 : index + // CHECK: %[[C100:.*]] = arith.constant 100 : index + %c100 = arith.constant 100 : index + // CHECK-NOT: = memref.alloca() + // CHECK: %[[ALLOCA:.*]] = memref.alloca() : memref<2xi32> + // CHECK-NOT: = memref.alloca() + %alloca = memref.alloca() : memref<2xi32> + // CHECK: memref.store %[[ARG0]], %[[ALLOCA]][%[[C0]]] + memref.store %arg0, %alloca[%c0] : memref<2xi32> + // CHECK: %[[RES:.*]] = memref.load %[[ALLOCA]][%[[C100]]] + %res = memref.load %alloca[%c100] : memref<2xi32> + // CHECK: return %[[RES]] : i32 + return %res : i32 +} From 046c57e705e082566e2d0073a2f21471a2b06a19 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Mon, 6 Nov 2023 13:49:04 +0100 Subject: [PATCH 08/40] Revert "[SLP]Improve tryToGatherExtractElements by using per-register analysis." This causes asserts: llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:10082: Value *llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::adjustExtracts( const TreeEntry *, MutableArrayRef, unsigned int, bool &): Assertion `Part == 0 && "Expected firs part."' failed. See comment on the code review. > Currently tryToGatherExtractElements function analyzes the whole vector, > regrdless number of actual registers, used in this vector. It may > prevent some optimizations, because per-register analysis may allow to > simplify the final code by reusing more already emitted vectors and > better shuffles. > > Differential Revision: https://reviews.llvm.org/D148855 This reverts commit 9dfdbd788707edc8c39eb2bff16004aba1f3586b. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 534 ++++++++---------- .../AArch64/extractelements-to-shuffle.ll | 135 +++-- .../SLPVectorizer/X86/crash_clear_undefs.ll | 2 +- .../SLPVectorizer/X86/hadd-inseltpoison.ll | 152 +---- .../test/Transforms/SLPVectorizer/X86/hadd.ll | 152 +---- .../SLPVectorizer/X86/hsub-inseltpoison.ll | 153 +---- .../test/Transforms/SLPVectorizer/X86/hsub.ll | 153 +---- .../X86/reused-extractelements.ll | 23 +- 8 files changed, 426 insertions(+), 878 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 1264627ee721ad..d9f016c98aeb17 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -662,36 +662,6 @@ tryToGatherSingleRegisterExtractElements(MutableArrayRef VL, return Res; } -/// Tries to find extractelement instructions with constant indices from fixed -/// vector type and gather such instructions into a bunch, which highly likely -/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was -/// successful, the matched scalars are replaced by poison values in \p VL for -/// future analysis. -static SmallVector> -tryToGatherExtractElements(SmallVectorImpl &VL, - SmallVectorImpl &Mask, unsigned NumParts) { - assert(NumParts > 0 && "NumParts expected be greater than or equal to 1."); - SmallVector> ShufflesRes(NumParts); - Mask.assign(VL.size(), PoisonMaskElem); - unsigned SliceSize = VL.size() / NumParts; - for (unsigned Part = 0; Part < NumParts; ++Part) { - // Scan list of gathered scalars for extractelements that can be represented - // as shuffles. 
- MutableArrayRef SubVL = - MutableArrayRef(VL).slice(Part * SliceSize, SliceSize); - SmallVector SubMask; - std::optional Res = - tryToGatherSingleRegisterExtractElements(SubVL, SubMask); - ShufflesRes[Part] = Res; - copy(SubMask, std::next(Mask.begin(), Part * SliceSize)); - } - if (none_of(ShufflesRes, [](const std::optional &Res) { - return Res.has_value(); - })) - ShufflesRes.clear(); - return ShufflesRes; -} - namespace { /// Main data required for vectorization of instructions. @@ -7182,80 +7152,101 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { : R.getGatherCost(Gathers, !Root && VL.equals(Gathers))); }; - /// Compute the cost of creating a vector containing the extracted values from - /// \p VL. - InstructionCost - computeExtractCost(ArrayRef VL, ArrayRef Mask, - ArrayRef> ShuffleKinds, - unsigned NumParts) { - assert(VL.size() > NumParts && "Unexpected scalarized shuffle."); - unsigned NumElts = - std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) { - auto *EE = dyn_cast(V); - if (!EE) - return Sz; - auto *VecTy = cast(EE->getVectorOperandType()); - return std::max(Sz, VecTy->getNumElements()); - }); - unsigned NumSrcRegs = TTI.getNumberOfParts( - FixedVectorType::get(VL.front()->getType(), NumElts)); - if (NumSrcRegs == 0) - NumSrcRegs = 1; - // FIXME: this must be moved to TTI for better estimation. - unsigned EltsPerVector = PowerOf2Ceil(std::max( - divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs))); - auto CheckPerRegistersShuffle = - [&](MutableArrayRef Mask) -> std::optional { - DenseSet RegIndices; - // Check that if trying to permute same single/2 input vectors. - TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc; - int FirstRegId = -1; - for (int &I : Mask) { - if (I == PoisonMaskElem) - continue; - int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector; - if (FirstRegId < 0) - FirstRegId = RegId; - RegIndices.insert(RegId); - if (RegIndices.size() > 2) - return std::nullopt; - if (RegIndices.size() == 2) - ShuffleKind = TTI::SK_PermuteTwoSrc; - I = (I % NumElts) % EltsPerVector + - (RegId == FirstRegId ? 0 : EltsPerVector); - } - return ShuffleKind; - }; + /// Compute the cost of creating a vector of type \p VecTy containing the + /// extracted values from \p VL. + InstructionCost computeExtractCost(ArrayRef VL, ArrayRef Mask, + TTI::ShuffleKind ShuffleKind) { + unsigned NumElts = 0; + for (Value *V : VL) { + auto *EE = dyn_cast(V); + if (!EE) + continue; + auto *VecTy = cast(EE->getVectorOperandType()); + NumElts = std::max(NumElts, VecTy->getNumElements()); + } + assert(NumElts > 0 && + "Expected at least 1-element fixed length vector(s)."); + auto *VecTy = FixedVectorType::get(VL.front()->getType(), NumElts); + unsigned NumOfParts = TTI.getNumberOfParts(VecTy); + if (!NumOfParts || NumElts < NumOfParts) + return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); + unsigned EltsPerVector = PowerOf2Ceil(divideCeil(NumElts, NumOfParts)); + int ValNum = -1; + int ValIdx = -1; + // Check that if trying to permute 2 input vectors (which may result in + // several vector registers), each per-register subvector is the result of + // the permutation of 2 single registers. 
+ if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc && + !all_of(enumerate(Mask), [&](auto &&Arg) { + if (Arg.value() == PoisonMaskElem) + return true; + int CurValNum = (Arg.value() % NumElts) / EltsPerVector; + int CurValIdx = Arg.index() / EltsPerVector; + if (ValIdx != CurValIdx) { + ValIdx = CurValIdx; + ValNum = CurValNum; + return true; + } + return CurValNum == ValNum; + })) + return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); + InstructionCost Cost = 0; // Process extracts in blocks of EltsPerVector to check if the source vector // operand can be re-used directly. If not, add the cost of creating a // shuffle to extract the values into a vector register. - for (unsigned Part = 0; Part < NumParts; ++Part) { - if (!ShuffleKinds[Part]) - continue; - ArrayRef MaskSlice = - Mask.slice(Part * EltsPerVector, - (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0) - ? Mask.size() % EltsPerVector - : EltsPerVector); - SmallVector SubMask(EltsPerVector, PoisonMaskElem); - copy(MaskSlice, SubMask.begin()); - std::optional RegShuffleKind = - CheckPerRegistersShuffle(SubMask); - if (!RegShuffleKind) { - Cost += TTI.getShuffleCost( - *ShuffleKinds[Part], - FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice); - continue; + auto *RegisterVecTy = + FixedVectorType::get(VL.front()->getType(), EltsPerVector); + SmallVector RegMask(EltsPerVector, PoisonMaskElem); + TTI::ShuffleKind RegisterSK = TargetTransformInfo::SK_PermuteSingleSrc; + Value *VecBase = nullptr; + bool IsIdentity = true; + for (auto [Idx, V] : enumerate(VL)) { + // Reached the start of a new vector registers. + if (Idx % EltsPerVector == 0) { + RegMask.assign(EltsPerVector, PoisonMaskElem); + RegisterSK = TargetTransformInfo::SK_PermuteSingleSrc; + VecBase = nullptr; } - if (*RegShuffleKind != TTI::SK_PermuteSingleSrc || - !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) { - Cost += TTI.getShuffleCost( - *RegShuffleKind, - FixedVectorType::get(VL.front()->getType(), EltsPerVector), - SubMask); + + // Need to exclude undefs from analysis. + if (isa(V) || Mask[Idx] == PoisonMaskElem) + continue; + + // Check all extracts for a vector register on the target directly + // extract values in order. + unsigned CurrentIdx = *getExtractIndex(cast(V)); + unsigned PrevIdx = CurrentIdx; + if (Idx % EltsPerVector != 0 && !isa(VL[Idx - 1]) && + Mask[Idx - 1] != PoisonMaskElem) + PrevIdx = *getExtractIndex(cast(VL[Idx - 1])) + 1; + if (!VecBase) { + VecBase = cast(V)->getVectorOperand(); + RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; + IsIdentity = CurrentIdx % EltsPerVector == Idx % EltsPerVector; + } else if (VecBase != cast(V)->getVectorOperand()) { + IsIdentity = false; + RegisterSK = TargetTransformInfo::SK_PermuteTwoSrc; + RegMask[Idx % EltsPerVector] = + CurrentIdx % EltsPerVector + EltsPerVector; + } else { + IsIdentity &= PrevIdx == CurrentIdx && + CurrentIdx % EltsPerVector == Idx % EltsPerVector; + RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; } + + if (IsIdentity) + continue; + + // Skip all indices, except for the last index per vector block. + if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size()) + continue; + + // If we have a series of extracts which are not consecutive and hence + // cannot re-use the source vector register directly, compute the shuffle + // cost to extract the vector with EltsPerVector elements. 
+ Cost += TTI.getShuffleCost(RegisterSK, RegisterVecTy, RegMask); } return Cost; } @@ -7473,76 +7464,90 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { SmallPtrSetImpl &CheckedExtracts) : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R), CheckedExtracts(CheckedExtracts) {} - Value *adjustExtracts(const TreeEntry *E, MutableArrayRef Mask, - ArrayRef> ShuffleKinds, - unsigned NumParts) { + Value *adjustExtracts(const TreeEntry *E, ArrayRef Mask, + TTI::ShuffleKind ShuffleKind) { if (Mask.empty()) return nullptr; Value *VecBase = nullptr; ArrayRef VL = E->Scalars; + auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); // If the resulting type is scalarized, do not adjust the cost. - if (NumParts == VL.size()) + unsigned VecNumParts = TTI.getNumberOfParts(VecTy); + if (VecNumParts == VecTy->getNumElements()) return nullptr; - // Check if it can be considered reused if same extractelements were - // vectorized already. - bool PrevNodeFound = any_of( - ArrayRef(R.VectorizableTree).take_front(E->Idx), - [&](const std::unique_ptr &TE) { - return ((!TE->isAltShuffle() && - TE->getOpcode() == Instruction::ExtractElement) || - TE->State == TreeEntry::NeedToGather) && - all_of(enumerate(TE->Scalars), [&](auto &&Data) { - return VL.size() > Data.index() && - (Mask[Data.index()] == PoisonMaskElem || - isa(VL[Data.index()]) || - Data.value() == VL[Data.index()]); - }); - }); - unsigned SliceSize = VL.size() / NumParts; - for (unsigned Part = 0; Part < NumParts; ++Part) { - ArrayRef SubMask = Mask.slice(Part * SliceSize, SliceSize); - for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) { - // Ignore non-extractelement scalars. - if (isa(V) || - (!SubMask.empty() && SubMask[I] == PoisonMaskElem)) - continue; - // If all users of instruction are going to be vectorized and this - // instruction itself is not going to be vectorized, consider this - // instruction as dead and remove its cost from the final cost of the - // vectorized tree. - // Also, avoid adjusting the cost for extractelements with multiple uses - // in different graph entries. - const TreeEntry *VE = R.getTreeEntry(V); - if (!CheckedExtracts.insert(V).second || - !R.areAllUsersVectorized(cast(V), &VectorizedVals) || - (VE && VE != E)) + DenseMap ExtractVectorsTys; + for (auto [I, V] : enumerate(VL)) { + // Ignore non-extractelement scalars. + if (isa(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem)) + continue; + // If all users of instruction are going to be vectorized and this + // instruction itself is not going to be vectorized, consider this + // instruction as dead and remove its cost from the final cost of the + // vectorized tree. + // Also, avoid adjusting the cost for extractelements with multiple uses + // in different graph entries. + const TreeEntry *VE = R.getTreeEntry(V); + if (!CheckedExtracts.insert(V).second || + !R.areAllUsersVectorized(cast(V), &VectorizedVals) || + (VE && VE != E)) + continue; + auto *EE = cast(V); + VecBase = EE->getVectorOperand(); + std::optional EEIdx = getExtractIndex(EE); + if (!EEIdx) + continue; + unsigned Idx = *EEIdx; + if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) { + auto It = + ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; + It->getSecond() = std::min(It->second, Idx); + } + // Take credit for instruction that will become dead. 
+ if (EE->hasOneUse()) { + Instruction *Ext = EE->user_back(); + if (isa(Ext) && all_of(Ext->users(), [](User *U) { + return isa(U); + })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), + EE->getVectorOperandType(), Idx); + // Add back the cost of s|zext which is subtracted separately. + Cost += TTI.getCastInstrCost( + Ext->getOpcode(), Ext->getType(), EE->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); continue; - auto *EE = cast(V); - VecBase = EE->getVectorOperand(); - std::optional EEIdx = getExtractIndex(EE); - if (!EEIdx) + } + } + Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, + Idx); + } + // Add a cost for subvector extracts/inserts if required. + for (const auto &Data : ExtractVectorsTys) { + auto *EEVTy = cast(Data.first->getType()); + unsigned NumElts = VecTy->getNumElements(); + if (Data.second % NumElts == 0) + continue; + if (TTI.getNumberOfParts(EEVTy) > VecNumParts) { + unsigned Idx = (Data.second / NumElts) * NumElts; + unsigned EENumElts = EEVTy->getNumElements(); + if (Idx % NumElts == 0) continue; - unsigned Idx = *EEIdx; - // Take credit for instruction that will become dead. - if (EE->hasOneUse() || !PrevNodeFound) { - Instruction *Ext = EE->user_back(); - if (isa(Ext) && all_of(Ext->users(), [](User *U) { - return isa(U); - })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - Cost -= - TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), - EE->getVectorOperandType(), Idx); - // Add back the cost of s|zext which is subtracted separately. - Cost += TTI.getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EE->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); - continue; - } + if (Idx + NumElts <= EENumElts) { + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, std::nullopt, CostKind, Idx, VecTy); + } else { + // Need to round up the subvector type vectorization factor to avoid a + // crash in cost model functions. Make SubVT so that Idx + VF of SubVT + // <= EENumElts. + auto *SubVT = + FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, std::nullopt, CostKind, Idx, SubVT); } - Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), - CostKind, Idx); + } else { + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, + VecTy, std::nullopt, CostKind, 0, EEVTy); } } // Check that gather of extractelements can be represented as just a @@ -7550,9 +7555,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { // Found the bunch of extractelement instructions that must be gathered // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. - // Done for reused if same extractelements were vectorized already. 
- if (!PrevNodeFound) - Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts); + Cost += computeExtractCost(VL, Mask, ShuffleKind); InVectors.assign(1, E); CommonMask.assign(Mask.begin(), Mask.end()); transformMaskAfterShuffle(CommonMask, CommonMask); @@ -7674,7 +7677,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { assert((IsFinalized || CommonMask.empty()) && "Shuffle construction must be finalized."); } - }; +}; InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, @@ -7735,41 +7738,40 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, reorderScalars(GatheredScalars, ReorderMask); SmallVector Mask; SmallVector ExtractMask; + std::optional ExtractShuffle; SmallVector> GatherShuffles; SmallVector> Entries; - SmallVector> ExtractShuffles; // Check for gathered extracts. + ExtractShuffle = + tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask); + bool Resized = false; unsigned NumParts = TTI->getNumberOfParts(VecTy); if (NumParts == 0 || NumParts >= GatheredScalars.size()) NumParts = 1; - if (!all_of(GatheredScalars, UndefValue::classof)) { - ExtractShuffles = - tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); - if (!ExtractShuffles.empty()) { - if (Value *VecBase = Estimator.adjustExtracts( - E, ExtractMask, ExtractShuffles, NumParts)) { - if (auto *VecBaseTy = dyn_cast(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && - GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } + if (Value *VecBase = Estimator.adjustExtracts( + E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) { + if (auto *VecBaseTy = dyn_cast(VecBase->getType())) + if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { + Resized = true; + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); } - } - - // Do not try to look for reshuffled loads for gathered loads (they will - // be handled later), for vectorized scalars, and cases, which are - // definitely not profitable (splats and small gather nodes.) - if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || - E->isAltShuffle() || - all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || - isSplat(E->Scalars) || - (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) - GatherShuffles = - isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); + } else if (ExtractShuffle && + TTI->getNumberOfParts(VecTy) == VecTy->getNumElements()) { + copy(VL, GatheredScalars.begin()); } + + // Do not try to look for reshuffled loads for gathered loads (they will be + // handled later), for vectorized scalars, and cases, which are definitely + // not profitable (splats and small gather nodes.) + if (ExtractShuffle || E->getOpcode() != Instruction::Load || + E->isAltShuffle() || + all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || + isSplat(E->Scalars) || + (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) + GatherShuffles = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); if (!GatherShuffles.empty()) { if (GatherShuffles.size() == 1 && *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && @@ -10011,10 +10013,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { : Builder(Builder), R(R) {} /// Adjusts extractelements after reusing them. 
- Value *adjustExtracts(const TreeEntry *E, MutableArrayRef Mask, - unsigned NumParts, bool &UseVecBaseAsInput) { - UseVecBaseAsInput = false; - SmallPtrSet UniqueBases; + Value *adjustExtracts(const TreeEntry *E, ArrayRef Mask) { Value *VecBase = nullptr; for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { int Idx = Mask[I]; @@ -10022,7 +10021,6 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { continue; auto *EI = cast(E->Scalars[I]); VecBase = EI->getVectorOperand(); - UniqueBases.insert(VecBase); // If the only one use is vectorized - can delete the extractelement // itself. if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) { @@ -10031,74 +10029,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { continue; R.eraseInstruction(EI); } - if (NumParts == 1 || UniqueBases.size() == 1) - return VecBase; - UseVecBaseAsInput = true; - auto TransformToIdentity = [](MutableArrayRef Mask) { - for (auto [I, Idx] : enumerate(Mask)) - if (Idx != PoisonMaskElem) - Idx = I; - }; - // Perform multi-register vector shuffle, joining them into a single virtual - // long vector. - // Need to shuffle each part independently and then insert all this parts - // into a long virtual vector register, forming the original vector. - Value *Vec = nullptr; - SmallVector VecMask(Mask.size(), PoisonMaskElem); - unsigned SliceSize = E->Scalars.size() / NumParts; - for (unsigned Part = 0; Part < NumParts; ++Part) { - ArrayRef VL = - ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize); - MutableArrayRef SubMask = Mask.slice(Part * SliceSize, SliceSize); - constexpr int MaxBases = 2; - SmallVector Bases(MaxBases); -#ifndef NDEBUG - int PrevSize = 0; -#endif // NDEBUG - for (const auto [I, V]: enumerate(VL)) { - if (SubMask[I] == PoisonMaskElem) - continue; - Value *VecOp = cast(V)->getVectorOperand(); - const int Size = - cast(VecOp->getType())->getNumElements(); -#ifndef NDEBUG - assert((PrevSize == Size || PrevSize == 0) && - "Expected vectors of the same size."); - PrevSize = Size; -#endif // NDEBUG - Bases[SubMask[I] < Size ? 0 : 1] = VecOp; - } - if (!Bases.front()) - continue; - Value *SubVec; - if (Bases.back()) { - SubVec = createShuffle(Bases.front(), Bases.back(), SubMask); - TransformToIdentity(SubMask); - } else { - SubVec = Bases.front(); - } - if (!Vec) { - Vec = SubVec; - assert(Part == 0 && "Expected firs part."); - copy(SubMask, VecMask.begin()); - } else { - unsigned VF = cast(Vec->getType())->getNumElements(); - if (Vec->getType() != SubVec->getType()) { - unsigned SubVecVF = - cast(SubVec->getType())->getNumElements(); - VF = std::max(VF, SubVecVF); - } - // Adjust SubMask. - for (auto [I, Idx] : enumerate(SubMask)) - if (Idx != PoisonMaskElem) - Idx += VF; - copy(SubMask, std::next(VecMask.begin(), Part * SliceSize)); - Vec = createShuffle(Vec, SubVec, VecMask); - TransformToIdentity(VecMask); - } - } - copy(VecMask, Mask.begin()); - return Vec; + return VecBase; } /// Checks if the specified entry \p E needs to be delayed because of its /// dependency nodes. 
@@ -10441,37 +10372,29 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { BVTy ShuffleBuilder(Params...); ResTy Res = ResTy(); SmallVector Mask; - SmallVector ExtractMask(GatheredScalars.size(), PoisonMaskElem); - SmallVector> ExtractShuffles; - Value *ExtractVecBase = nullptr; - bool UseVecBaseAsInput = false; + SmallVector ExtractMask; + std::optional ExtractShuffle; SmallVector> GatherShuffles; SmallVector> Entries; Type *ScalarTy = GatheredScalars.front()->getType(); - auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size()); - unsigned NumParts = TTI->getNumberOfParts(VecTy); + unsigned NumParts = TTI->getNumberOfParts( + FixedVectorType::get(ScalarTy, GatheredScalars.size())); if (NumParts == 0 || NumParts >= GatheredScalars.size()) NumParts = 1; if (!all_of(GatheredScalars, UndefValue::classof)) { // Check for gathered extracts. + ExtractShuffle = + tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask); bool Resized = false; - ExtractShuffles = - tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); - if (!ExtractShuffles.empty()) { - if (Value *VecBase = ShuffleBuilder.adjustExtracts( - E, ExtractMask, NumParts, UseVecBaseAsInput)) { - ExtractVecBase = VecBase; - if (auto *VecBaseTy = dyn_cast(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && - GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } - } - } + if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask)) + if (auto *VecBaseTy = dyn_cast(VecBase->getType())) + if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { + Resized = true; + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); + } // Gather extracts after we check for full matched gathers only. - if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || + if (ExtractShuffle || E->getOpcode() != Instruction::Load || E->isAltShuffle() || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || @@ -10622,35 +10545,30 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { } } }; - if (!ExtractShuffles.empty() || !GatherShuffles.empty()) { + if (ExtractShuffle || !GatherShuffles.empty()) { bool IsNonPoisoned = true; bool IsUsedInExpr = true; Value *Vec1 = nullptr; - if (!ExtractShuffles.empty()) { + if (ExtractShuffle) { // Gather of extractelements can be represented as just a shuffle of // a single/two vectors the scalars are extracted from. // Find input vectors. 
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
-       if (!Mask.empty() && Mask[I] != PoisonMaskElem)
+       if (ExtractMask[I] == PoisonMaskElem ||
+           (!Mask.empty() && Mask[I] != PoisonMaskElem)) {
          ExtractMask[I] = PoisonMaskElem;
-     }
-     if (UseVecBaseAsInput) {
-       Vec1 = ExtractVecBase;
-     } else {
-       for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
-         if (ExtractMask[I] == PoisonMaskElem)
-           continue;
-         if (isa<UndefValue>(E->Scalars[I]))
-           continue;
-         auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
-         if (!Vec1) {
-           Vec1 = EI->getVectorOperand();
-         } else if (Vec1 != EI->getVectorOperand()) {
-           assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
-                  "Expected only 1 or 2 vectors shuffle.");
-           Vec2 = EI->getVectorOperand();
-         }
+         continue;
+       }
+       if (isa<UndefValue>(E->Scalars[I]))
+         continue;
+       auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+       if (!Vec1) {
+         Vec1 = EI->getVectorOperand();
+       } else if (Vec1 != EI->getVectorOperand()) {
+         assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
+                "Expected only 1 or 2 vectors shuffle.");
+         Vec2 = EI->getVectorOperand();
        }
      }
      if (Vec2) {
@@ -10711,14 +10629,10 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
   int MSz = Mask.size();
   // Try to build constant vector and shuffle with it only if currently we
   // have a single permutation and more than 1 scalar constants.
-  bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
+  bool IsSingleShuffle = !ExtractShuffle || GatherShuffles.empty();
   bool IsIdentityShuffle =
-      ((UseVecBaseAsInput ||
-        all_of(ExtractShuffles,
-               [](const std::optional<TTI::ShuffleKind> &SK) {
-                 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
-                        TTI::SK_PermuteSingleSrc;
-               })) &&
+      (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
+           TTI::SK_PermuteSingleSrc &&
       none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
       ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
      (!GatherShuffles.empty() &&
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
index 8f76b2e54e6c2d..e60e356e5cd819 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -75,47 +75,64 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
 ; CHECK-NEXT: [[TMP4TT_0_LCSSA:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_END_LOOPEXIT]] ]
 ; CHECK-NEXT: [[PB_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PB]], [[ENTRY]] ], [ [[SCEVGEP311]], [[WHILE_END_LOOPEXIT]] ]
 ; CHECK-NEXT: [[PA_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PA]], [[ENTRY]] ], [ [[SCEVGEP]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> 
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> 
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <4 x i32> 
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> 
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> 
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <4 x i32> 
-; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i64> [[TMP12]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i32>
+; CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> 
[[TMP4TT_0_LCSSA]], i64 0 +; CHECK-NEXT: [[VGETQ_LANE45:%.*]] = extractelement <2 x i64> [[TMP4TT_0_LCSSA]], i64 1 +; CHECK-NEXT: [[ADD:%.*]] = add i64 [[VGETQ_LANE]], [[VGETQ_LANE45]] +; CHECK-NEXT: [[CONV48:%.*]] = trunc i64 [[ADD]] to i32 +; CHECK-NEXT: [[VGETQ_LANE51:%.*]] = extractelement <2 x i64> [[TMP4FF_0_LCSSA]], i64 0 +; CHECK-NEXT: [[VGETQ_LANE55:%.*]] = extractelement <2 x i64> [[TMP4FF_0_LCSSA]], i64 1 +; CHECK-NEXT: [[ADD57:%.*]] = add i64 [[VGETQ_LANE51]], [[VGETQ_LANE55]] +; CHECK-NEXT: [[CONV60:%.*]] = trunc i64 [[ADD57]] to i32 +; CHECK-NEXT: [[VGETQ_LANE63:%.*]] = extractelement <2 x i64> [[TMP4TF_0_LCSSA]], i64 0 +; CHECK-NEXT: [[VGETQ_LANE67:%.*]] = extractelement <2 x i64> [[TMP4TF_0_LCSSA]], i64 1 +; CHECK-NEXT: [[ADD69:%.*]] = add i64 [[VGETQ_LANE63]], [[VGETQ_LANE67]] +; CHECK-NEXT: [[CONV72:%.*]] = trunc i64 [[ADD69]] to i32 +; CHECK-NEXT: [[VGETQ_LANE75:%.*]] = extractelement <2 x i64> [[TMP4FT_0_LCSSA]], i64 0 +; CHECK-NEXT: [[VGETQ_LANE79:%.*]] = extractelement <2 x i64> [[TMP4FT_0_LCSSA]], i64 1 +; CHECK-NEXT: [[ADD81:%.*]] = add i64 [[VGETQ_LANE75]], [[VGETQ_LANE79]] +; CHECK-NEXT: [[CONV84:%.*]] = trunc i64 [[ADD81]] to i32 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[NUMBEROFBOOLS]], 127 ; CHECK-NEXT: [[CMP86284:%.*]] = icmp ugt i32 [[AND]], 31 ; CHECK-NEXT: br i1 [[CMP86284]], label [[WHILE_BODY88:%.*]], label [[WHILE_END122:%.*]] ; CHECK: while.body88: ; CHECK-NEXT: [[PA_ADDR_1291:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_END121:%.*]] ], [ [[PA_ADDR_0_LCSSA]], [[WHILE_END]] ] ; CHECK-NEXT: [[PB_ADDR_1290:%.*]] = phi ptr [ [[INCDEC_PTR89:%.*]], [[WHILE_END121]] ], [ [[PB_ADDR_0_LCSSA]], [[WHILE_END]] ] +; CHECK-NEXT: [[_CTT_0289:%.*]] = phi i32 [ [[ADD99:%.*]], [[WHILE_END121]] ], [ [[CONV48]], [[WHILE_END]] ] +; CHECK-NEXT: [[_CFF_0288:%.*]] = phi i32 [ [[ADD106:%.*]], [[WHILE_END121]] ], [ [[CONV60]], [[WHILE_END]] ] +; CHECK-NEXT: [[_CTF_0287:%.*]] = phi i32 [ [[ADD113:%.*]], [[WHILE_END121]] ], [ [[CONV72]], [[WHILE_END]] ] +; CHECK-NEXT: [[_CFT_0286:%.*]] = phi i32 [ [[ADD120:%.*]], [[WHILE_END121]] ], [ [[CONV84]], [[WHILE_END]] ] ; CHECK-NEXT: [[NBBOOLBLOCK_1285:%.*]] = phi i32 [ [[SUB:%.*]], [[WHILE_END121]] ], [ [[AND]], [[WHILE_END]] ] -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP34:%.*]], [[WHILE_END121]] ], [ [[TMP17]], [[WHILE_END]] ] -; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[PA_ADDR_1291]], align 4 -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[PB_ADDR_1290]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[PA_ADDR_1291]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[PB_ADDR_1290]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY93:%.*]] ; CHECK: while.body93: -; CHECK-NEXT: [[A_0279:%.*]] = phi i32 [ [[TMP19]], [[WHILE_BODY88]] ], [ [[SHR96:%.*]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[B_0278:%.*]] = phi i32 [ [[TMP20]], [[WHILE_BODY88]] ], [ [[SHR97:%.*]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[_CTT_1283:%.*]] = phi i32 [ [[_CTT_0289]], [[WHILE_BODY88]] ], [ [[ADD99]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[_CFF_1282:%.*]] = phi i32 [ [[_CFF_0288]], [[WHILE_BODY88]] ], [ [[ADD106]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[_CTF_1281:%.*]] = phi i32 [ [[_CTF_0287]], [[WHILE_BODY88]] ], [ [[ADD113]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[_CFT_1280:%.*]] = phi i32 [ [[_CFT_0286]], [[WHILE_BODY88]] ], [ [[ADD120]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[A_0279:%.*]] = phi i32 [ [[TMP10]], [[WHILE_BODY88]] ], [ [[SHR96:%.*]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[B_0278:%.*]] = phi i32 [ [[TMP11]], [[WHILE_BODY88]] ], [ [[SHR97:%.*]], 
[[WHILE_BODY93]] ] ; CHECK-NEXT: [[SHIFT_0277:%.*]] = phi i32 [ 0, [[WHILE_BODY88]] ], [ [[INC:%.*]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i32> [ [[TMP18]], [[WHILE_BODY88]] ], [ [[TMP34]], [[WHILE_BODY93]] ] ; CHECK-NEXT: [[AND94:%.*]] = and i32 [[A_0279]], 1 ; CHECK-NEXT: [[AND95:%.*]] = and i32 [[B_0278]], 1 ; CHECK-NEXT: [[SHR96]] = lshr i32 [[A_0279]], 1 ; CHECK-NEXT: [[SHR97]] = lshr i32 [[B_0278]], 1 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[AND94]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <2 x i32> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> poison, i32 [[AND95]], i32 0 -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <2 x i32> [[TMP28]], zeroinitializer -; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <2 x i32> [[TMP28]], zeroinitializer -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP31]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP33:%.*]] = zext <4 x i1> [[TMP32]] to <4 x i32> -; CHECK-NEXT: [[TMP34]] = add <4 x i32> [[TMP21]], [[TMP33]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[AND94]], 0 +; CHECK-NEXT: [[TOBOOL98:%.*]] = icmp ne i32 [[AND95]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL98]], i1 false +; CHECK-NEXT: [[LAND_EXT:%.*]] = zext i1 [[TMP12]] to i32 +; CHECK-NEXT: [[ADD99]] = add i32 [[_CTT_1283]], [[LAND_EXT]] +; CHECK-NEXT: [[TOBOOL100:%.*]] = icmp eq i32 [[AND94]], 0 +; CHECK-NEXT: [[TOBOOL103:%.*]] = icmp eq i32 [[AND95]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TOBOOL100]], i1 [[TOBOOL103]], i1 false +; CHECK-NEXT: [[LAND_EXT105:%.*]] = zext i1 [[TMP13]] to i32 +; CHECK-NEXT: [[ADD106]] = add i32 [[_CFF_1282]], [[LAND_EXT105]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL103]], i1 false +; CHECK-NEXT: [[LAND_EXT112:%.*]] = zext i1 [[TMP14]] to i32 +; CHECK-NEXT: [[ADD113]] = add i32 [[_CTF_1281]], [[LAND_EXT112]] +; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TOBOOL100]], i1 [[TOBOOL98]], i1 false +; CHECK-NEXT: [[LAND_EXT119:%.*]] = zext i1 [[TMP15]] to i32 +; CHECK-NEXT: [[ADD120]] = add i32 [[_CFT_1280]], [[LAND_EXT119]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[SHIFT_0277]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], 32 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[WHILE_END121]], label [[WHILE_BODY93]] @@ -127,53 +144,61 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef ; CHECK-NEXT: br i1 [[CMP86]], label [[WHILE_BODY88]], label [[WHILE_END122]] ; CHECK: while.end122: ; CHECK-NEXT: [[NBBOOLBLOCK_1_LCSSA:%.*]] = phi i32 [ [[AND]], [[WHILE_END]] ], [ [[SUB]], [[WHILE_END121]] ] +; CHECK-NEXT: [[_CFT_0_LCSSA:%.*]] = phi i32 [ [[CONV84]], [[WHILE_END]] ], [ [[ADD120]], [[WHILE_END121]] ] +; CHECK-NEXT: [[_CTF_0_LCSSA:%.*]] = phi i32 [ [[CONV72]], [[WHILE_END]] ], [ [[ADD113]], [[WHILE_END121]] ] +; CHECK-NEXT: [[_CFF_0_LCSSA:%.*]] = phi i32 [ [[CONV60]], [[WHILE_END]] ], [ [[ADD106]], [[WHILE_END121]] ] +; CHECK-NEXT: [[_CTT_0_LCSSA:%.*]] = phi i32 [ [[CONV48]], [[WHILE_END]] ], [ [[ADD99]], 
[[WHILE_END121]] ] ; CHECK-NEXT: [[PB_ADDR_1_LCSSA:%.*]] = phi ptr [ [[PB_ADDR_0_LCSSA]], [[WHILE_END]] ], [ [[INCDEC_PTR89]], [[WHILE_END121]] ] ; CHECK-NEXT: [[PA_ADDR_1_LCSSA:%.*]] = phi ptr [ [[PA_ADDR_0_LCSSA]], [[WHILE_END]] ], [ [[INCDEC_PTR]], [[WHILE_END121]] ] -; CHECK-NEXT: [[TMP35:%.*]] = phi <4 x i32> [ [[TMP17]], [[WHILE_END]] ], [ [[TMP34]], [[WHILE_END121]] ] ; CHECK-NEXT: [[CMP130_NOT299:%.*]] = icmp eq i32 [[NBBOOLBLOCK_1_LCSSA]], 0 ; CHECK-NEXT: br i1 [[CMP130_NOT299]], label [[WHILE_END166:%.*]], label [[WHILE_BODY132_PREHEADER:%.*]] ; CHECK: while.body132.preheader: -; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[PB_ADDR_1_LCSSA]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[PB_ADDR_1_LCSSA]], align 4 ; CHECK-NEXT: [[SUB125:%.*]] = sub nuw nsw i32 32, [[NBBOOLBLOCK_1_LCSSA]] -; CHECK-NEXT: [[SHR128:%.*]] = lshr i32 [[TMP36]], [[SUB125]] -; CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[PA_ADDR_1_LCSSA]], align 4 -; CHECK-NEXT: [[SHR126:%.*]] = lshr i32 [[TMP37]], [[SUB125]] +; CHECK-NEXT: [[SHR128:%.*]] = lshr i32 [[TMP16]], [[SUB125]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[PA_ADDR_1_LCSSA]], align 4 +; CHECK-NEXT: [[SHR126:%.*]] = lshr i32 [[TMP17]], [[SUB125]] ; CHECK-NEXT: br label [[WHILE_BODY132:%.*]] ; CHECK: while.body132: +; CHECK-NEXT: [[_CTT_2306:%.*]] = phi i32 [ [[ADD142:%.*]], [[WHILE_BODY132]] ], [ [[_CTT_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] +; CHECK-NEXT: [[_CFF_2305:%.*]] = phi i32 [ [[ADD150:%.*]], [[WHILE_BODY132]] ], [ [[_CFF_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] +; CHECK-NEXT: [[_CTF_2304:%.*]] = phi i32 [ [[ADD157:%.*]], [[WHILE_BODY132]] ], [ [[_CTF_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] +; CHECK-NEXT: [[_CFT_2303:%.*]] = phi i32 [ [[ADD164:%.*]], [[WHILE_BODY132]] ], [ [[_CFT_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[NBBOOLBLOCK_2302:%.*]] = phi i32 [ [[DEC165:%.*]], [[WHILE_BODY132]] ], [ [[NBBOOLBLOCK_1_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[A_1301:%.*]] = phi i32 [ [[SHR135:%.*]], [[WHILE_BODY132]] ], [ [[SHR126]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[B_1300:%.*]] = phi i32 [ [[SHR136:%.*]], [[WHILE_BODY132]] ], [ [[SHR128]], [[WHILE_BODY132_PREHEADER]] ] -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP51:%.*]], [[WHILE_BODY132]] ], [ [[TMP35]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[AND133:%.*]] = and i32 [[A_1301]], 1 ; CHECK-NEXT: [[AND134:%.*]] = and i32 [[B_1300]], 1 ; CHECK-NEXT: [[SHR135]] = lshr i32 [[A_1301]], 1 ; CHECK-NEXT: [[SHR136]] = lshr i32 [[B_1300]], 1 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <2 x i32> poison, i32 [[AND133]], i32 0 -; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x i32> [[TMP39]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <2 x i32> [[TMP40]], zeroinitializer -; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <2 x i32> [[TMP40]], zeroinitializer -; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x i32> poison, i32 [[AND134]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <2 x i32> [[TMP45]], zeroinitializer -; CHECK-NEXT: [[TMP47:%.*]] = icmp eq <2 x i32> [[TMP45]], zeroinitializer -; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> -; CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP43]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP50:%.*]] = 
zext <4 x i1> [[TMP49]] to <4 x i32> -; CHECK-NEXT: [[TMP51]] = add <4 x i32> [[TMP38]], [[TMP50]] +; CHECK-NEXT: [[TOBOOL137:%.*]] = icmp ne i32 [[AND133]], 0 +; CHECK-NEXT: [[TOBOOL139:%.*]] = icmp ne i32 [[AND134]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TOBOOL137]], i1 [[TOBOOL139]], i1 false +; CHECK-NEXT: [[LAND_EXT141:%.*]] = zext i1 [[TMP18]] to i32 +; CHECK-NEXT: [[ADD142]] = add i32 [[_CTT_2306]], [[LAND_EXT141]] +; CHECK-NEXT: [[TOBOOL144:%.*]] = icmp eq i32 [[AND133]], 0 +; CHECK-NEXT: [[TOBOOL147:%.*]] = icmp eq i32 [[AND134]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TOBOOL144]], i1 [[TOBOOL147]], i1 false +; CHECK-NEXT: [[LAND_EXT149:%.*]] = zext i1 [[TMP19]] to i32 +; CHECK-NEXT: [[ADD150]] = add i32 [[_CFF_2305]], [[LAND_EXT149]] +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TOBOOL137]], i1 [[TOBOOL147]], i1 false +; CHECK-NEXT: [[LAND_EXT156:%.*]] = zext i1 [[TMP20]] to i32 +; CHECK-NEXT: [[ADD157]] = add i32 [[_CTF_2304]], [[LAND_EXT156]] +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TOBOOL144]], i1 [[TOBOOL139]], i1 false +; CHECK-NEXT: [[LAND_EXT163:%.*]] = zext i1 [[TMP21]] to i32 +; CHECK-NEXT: [[ADD164]] = add i32 [[_CFT_2303]], [[LAND_EXT163]] ; CHECK-NEXT: [[DEC165]] = add nsw i32 [[NBBOOLBLOCK_2302]], -1 ; CHECK-NEXT: [[CMP130_NOT:%.*]] = icmp eq i32 [[DEC165]], 0 ; CHECK-NEXT: br i1 [[CMP130_NOT]], label [[WHILE_END166]], label [[WHILE_BODY132]] ; CHECK: while.end166: -; CHECK-NEXT: [[TMP52:%.*]] = phi <4 x i32> [ [[TMP35]], [[WHILE_END122]] ], [ [[TMP51]], [[WHILE_BODY132]] ] -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 2 -; CHECK-NEXT: store i32 [[TMP53]], ptr [[CTT:%.*]], align 4 -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3 -; CHECK-NEXT: store i32 [[TMP54]], ptr [[CFF:%.*]], align 4 -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP52]], i32 1 -; CHECK-NEXT: store i32 [[TMP55]], ptr [[CTF:%.*]], align 4 -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i32> [[TMP52]], i32 0 -; CHECK-NEXT: store i32 [[TMP56]], ptr [[CFT:%.*]], align 4 +; CHECK-NEXT: [[_CFT_2_LCSSA:%.*]] = phi i32 [ [[_CFT_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD164]], [[WHILE_BODY132]] ] +; CHECK-NEXT: [[_CTF_2_LCSSA:%.*]] = phi i32 [ [[_CTF_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD157]], [[WHILE_BODY132]] ] +; CHECK-NEXT: [[_CFF_2_LCSSA:%.*]] = phi i32 [ [[_CFF_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD150]], [[WHILE_BODY132]] ] +; CHECK-NEXT: [[_CTT_2_LCSSA:%.*]] = phi i32 [ [[_CTT_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD142]], [[WHILE_BODY132]] ] +; CHECK-NEXT: store i32 [[_CTT_2_LCSSA]], ptr [[CTT:%.*]], align 4 +; CHECK-NEXT: store i32 [[_CFF_2_LCSSA]], ptr [[CFF:%.*]], align 4 +; CHECK-NEXT: store i32 [[_CTF_2_LCSSA]], ptr [[CTF:%.*]], align 4 +; CHECK-NEXT: store i32 [[_CFT_2_LCSSA]], ptr [[CFT:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll index c2369a6a89ec1d..de99654d84eb81 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll @@ -9,7 +9,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 ; YAML-NEXT: Function: foo ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-4' +; YAML-NEXT: - Cost: '-3' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '10' ; YAML-NEXT: ... 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll index 4a9f717918a029..0217ddcac00468 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll @@ -166,31 +166,11 @@ define void @test_v4f32_v2f32_store(<4 x float> %f, ptr %p){ ; define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { -; SSE-LABEL: @test_v4f64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> -; SSE-NEXT: ret <4 x double> [[TMP7]] -; -; SLM-LABEL: @test_v4f64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> -; SLM-NEXT: ret <4 x double> [[TMP7]] -; -; AVX-LABEL: @test_v4f64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <4 x double> [[TMP3]] +; CHECK-LABEL: @test_v4f64( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x double> [[TMP3]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -286,31 +266,11 @@ define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b } define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { -; SSE-LABEL: @test_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> -; SSE-NEXT: ret <8 x float> [[TMP7]] -; -; SLM-LABEL: @test_v8f32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector 
<8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> -; SLM-NEXT: ret <8 x float> [[TMP7]] -; -; AVX-LABEL: @test_v8f32( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <8 x float> [[TMP3]] +; CHECK-LABEL: @test_v8f32( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -348,31 +308,11 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; SSE-LABEL: @test_v4i64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> -; SSE-NEXT: ret <4 x i64> [[TMP7]] -; -; SLM-LABEL: @test_v4i64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> -; SLM-NEXT: ret <4 x i64> [[TMP7]] -; -; AVX-LABEL: @test_v4i64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <4 x i64> [[TMP3]] +; CHECK-LABEL: @test_v4i64( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -394,31 +334,11 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: @test_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x 
i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[TMP7]] -; -; SLM-LABEL: @test_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> -; SLM-NEXT: ret <8 x i32> [[TMP7]] -; -; AVX-LABEL: @test_v8i32( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <8 x i32> [[TMP3]] +; CHECK-LABEL: @test_v8i32( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -456,31 +376,11 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { } define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { -; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> -; SSE-NEXT: ret <16 x i16> [[TMP7]] -; -; SLM-LABEL: @test_v16i16( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> -; SLM-NEXT: ret <16 x i16> [[TMP7]] -; -; AVX-LABEL: @test_v16i16( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; AVX-NEXT: 
[[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <16 x i16> [[TMP3]] +; CHECK-LABEL: @test_v16i16( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <16 x i16> [[TMP3]] ; %a0 = extractelement <16 x i16> %a, i32 0 %a1 = extractelement <16 x i16> %a, i32 1 @@ -548,3 +448,5 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15 ret <16 x i16> %rv15 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll index cac6845c43004f..c38d116a7a323c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll @@ -166,31 +166,11 @@ define void @test_v4f32_v2f32_store(<4 x float> %f, ptr %p){ ; define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { -; SSE-LABEL: @test_v4f64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> -; SSE-NEXT: ret <4 x double> [[TMP7]] -; -; SLM-LABEL: @test_v4f64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> -; SLM-NEXT: ret <4 x double> [[TMP7]] -; -; AVX-LABEL: @test_v4f64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <4 x double> [[TMP3]] +; CHECK-LABEL: @test_v4f64( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x double> [[TMP3]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -286,31 +266,11 @@ define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b } define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { -; SSE-LABEL: @test_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> 
[[B:%.*]], <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> -; SSE-NEXT: ret <8 x float> [[TMP7]] -; -; SLM-LABEL: @test_v8f32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> -; SLM-NEXT: ret <8 x float> [[TMP7]] -; -; AVX-LABEL: @test_v8f32( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <8 x float> [[TMP3]] +; CHECK-LABEL: @test_v8f32( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -348,31 +308,11 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; SSE-LABEL: @test_v4i64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> -; SSE-NEXT: ret <4 x i64> [[TMP7]] -; -; SLM-LABEL: @test_v4i64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> -; SLM-NEXT: ret <4 x i64> [[TMP7]] -; -; AVX-LABEL: @test_v4i64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> 
[[B]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <4 x i64> [[TMP3]] +; CHECK-LABEL: @test_v4i64( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -394,31 +334,11 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: @test_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[TMP7]] -; -; SLM-LABEL: @test_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> -; SLM-NEXT: ret <8 x i32> [[TMP7]] -; -; AVX-LABEL: @test_v8i32( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <8 x i32> [[TMP3]] +; CHECK-LABEL: @test_v8i32( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -456,31 +376,11 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { } define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { -; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> -; SSE-NEXT: ret <16 x i16> [[TMP7]] -; -; SLM-LABEL: @test_v16i16( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SLM-NEXT: 
[[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> -; SLM-NEXT: ret <16 x i16> [[TMP7]] -; -; AVX-LABEL: @test_v16i16( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; AVX-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <16 x i16> [[TMP3]] +; CHECK-LABEL: @test_v16i16( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <16 x i16> [[TMP3]] ; %a0 = extractelement <16 x i16> %a, i32 0 %a1 = extractelement <16 x i16> %a, i32 1 @@ -548,3 +448,5 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15 ret <16 x i16> %rv15 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll index 40b6a8c32f5d02..39400ba4ce1e83 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll @@ -145,31 +145,11 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) { ; define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { -; SSE-LABEL: @test_v4f64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> -; SSE-NEXT: ret <4 x double> [[TMP7]] -; -; SLM-LABEL: @test_v4f64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> -; SLM-NEXT: ret <4 x double> [[TMP7]] -; -; AVX-LABEL: @test_v4f64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; AVX-NEXT: 
[[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <4 x double> [[TMP3]] +; CHECK-LABEL: @test_v4f64( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x double> [[TMP3]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -191,31 +171,11 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { } define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { -; SSE-LABEL: @test_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> -; SSE-NEXT: ret <8 x float> [[TMP7]] -; -; SLM-LABEL: @test_v8f32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> -; SLM-NEXT: ret <8 x float> [[TMP7]] -; -; AVX-LABEL: @test_v8f32( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <8 x float> [[TMP3]] +; CHECK-LABEL: @test_v8f32( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -253,31 +213,11 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; SSE-LABEL: @test_v4i64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> -; SSE-NEXT: ret <4 x i64> [[TMP7]] -; -; SLM-LABEL: @test_v4i64( -; SLM-NEXT: 
[[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> -; SLM-NEXT: ret <4 x i64> [[TMP7]] -; -; AVX-LABEL: @test_v4i64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <4 x i64> [[TMP3]] +; CHECK-LABEL: @test_v4i64( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -299,31 +239,11 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: @test_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[TMP7]] -; -; SLM-LABEL: @test_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> -; SLM-NEXT: ret <8 x i32> [[TMP7]] -; -; AVX-LABEL: @test_v8i32( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <8 x i32> [[TMP3]] +; CHECK-LABEL: @test_v8i32( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -361,31 +281,11 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { } define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { -; 
SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> -; SSE-NEXT: ret <16 x i16> [[TMP7]] -; -; SLM-LABEL: @test_v16i16( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> -; SLM-NEXT: ret <16 x i16> [[TMP7]] -; -; AVX-LABEL: @test_v16i16( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; AVX-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <16 x i16> [[TMP3]] +; CHECK-LABEL: @test_v16i16( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <16 x i16> [[TMP3]] ; %a0 = extractelement <16 x i16> %a, i32 0 %a1 = extractelement <16 x i16> %a, i32 1 @@ -454,6 +354,9 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ret <16 x i16> %rv15 } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX: {{.*}} ; AVX1: {{.*}} ; AVX2: {{.*}} ; AVX512: {{.*}} +; SLM: {{.*}} +; SSE: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll index 09113323d3ab77..6b63de83c56be2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll @@ -145,31 +145,11 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) { ; define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { -; SSE-LABEL: @test_v4f64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> -; SSE-NEXT: ret <4 x double> [[TMP7]] -; -; SLM-LABEL: @test_v4f64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> -; SLM-NEXT: ret <4 x double> [[TMP7]] -; -; AVX-LABEL: @test_v4f64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <4 x double> [[TMP3]] +; CHECK-LABEL: @test_v4f64( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x double> [[TMP3]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -191,31 +171,11 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { } define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { -; SSE-LABEL: @test_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> -; SSE-NEXT: ret <8 x float> [[TMP7]] -; -; SLM-LABEL: @test_v8f32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> 
[[B:%.*]], <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> -; SLM-NEXT: ret <8 x float> [[TMP7]] -; -; AVX-LABEL: @test_v8f32( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <8 x float> [[TMP3]] +; CHECK-LABEL: @test_v8f32( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -253,31 +213,11 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; SSE-LABEL: @test_v4i64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> -; SSE-NEXT: ret <4 x i64> [[TMP7]] -; -; SLM-LABEL: @test_v4i64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> -; SLM-NEXT: ret <4 x i64> [[TMP7]] -; -; AVX-LABEL: @test_v4i64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <4 x i64> [[TMP3]] +; CHECK-LABEL: @test_v4i64( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -299,31 +239,11 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; 
SSE-LABEL: @test_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[TMP7]] -; -; SLM-LABEL: @test_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> -; SLM-NEXT: ret <8 x i32> [[TMP7]] -; -; AVX-LABEL: @test_v8i32( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <8 x i32> [[TMP3]] +; CHECK-LABEL: @test_v8i32( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -361,31 +281,11 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { } define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { -; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> -; SSE-NEXT: ret <16 x i16> [[TMP7]] -; -; SLM-LABEL: @test_v16i16( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] -; SLM-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> -; SLM-NEXT: ret <16 x i16> [[TMP7]] -; -; AVX-LABEL: @test_v16i16( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; AVX-NEXT: [[TMP2:%.*]] = 
shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; AVX-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <16 x i16> [[TMP3]] +; CHECK-LABEL: @test_v16i16( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <16 x i16> [[TMP3]] ; %a0 = extractelement <16 x i16> %a, i32 0 %a1 = extractelement <16 x i16> %a, i32 1 @@ -454,6 +354,9 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ret <16 x i16> %rv15 } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} ; AVX1: {{.*}} ; AVX2: {{.*}} ; AVX512: {{.*}} +; SLM: {{.*}} +; SSE: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll index 94a1d7aa1951c4..35cb8c729e106f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll @@ -2,24 +2,23 @@ ; RUN: opt < %s -passes=slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 -pass-remarks-output=%t | FileCheck %s ; RUN: FileCheck --input-file=%t --check-prefix=YAML %s -; YAML: --- !Missed +; YAML: --- !Passed ; YAML-NEXT: Pass: slp-vectorizer -; YAML-NEXT: Name: NotBeneficial +; YAML-NEXT: Name: VectorizedList ; YAML-NEXT: Function: g ; YAML-NEXT: Args: -; YAML-NEXT: - String: 'List vectorization was possible but not beneficial with cost ' -; YAML-NEXT: - Cost: '0' -; YAML-NEXT: - String: ' >= ' -; YAML-NEXT: - Treshold: '0' +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '-1' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '4' define <2 x i32> @g(<2 x i32> %x, i32 %a, i32 %b) { ; CHECK-LABEL: @g( -; CHECK-NEXT: [[X1:%.*]] = extractelement <2 x i32> [[X:%.*]], i32 1 -; CHECK-NEXT: [[X1X1:%.*]] = mul i32 [[X1]], [[X1]] -; CHECK-NEXT: [[AB:%.*]] = mul i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i32> poison, i32 [[X1X1]], i32 0 -; CHECK-NEXT: [[INS2:%.*]] = insertelement <2 x i32> [[INS1]], i32 [[AB]], i32 1 -; CHECK-NEXT: ret <2 x i32> [[INS2]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[B:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <2 x i32> [[TMP4]] ; %x1 = extractelement <2 x i32> %x, i32 1 %x1x1 = mul i32 %x1, %x1 From edea9743490ec01dca2d70f343f1544e7ead4915 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 6 Nov 2023 14:15:24 +0100 Subject: [PATCH 09/40] [ValueTracking] Avoid FP cast constant expressions Use the constant folding API instead. 
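For illustration only (not part of the diff; the helper name is hypothetical): ConstantFoldCastOperand() from llvm/Analysis/ConstantFolding.h folds the cast through the constant folding API instead of materializing an FP cast constant expression, and it returns nullptr when no folded constant is available, much like the old calls that passed true for OnlyIfReduced, so the caller still has to check the result.

  #include "llvm/Analysis/ConstantFolding.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Instruction.h"

  using namespace llvm;

  // Hypothetical helper mirroring one switch arm of lookThroughCast():
  // fold an FPExt of C to SrcTy via the constant folding API rather than
  // building a ConstantExpr with ConstantExpr::getFPExtend.
  static Constant *foldFPExtOperand(Constant *C, Type *SrcTy,
                                    const DataLayout &DL) {
    // May return nullptr if the cast cannot be folded to a constant.
    return ConstantFoldCastOperand(Instruction::FPExt, C, SrcTy, DL);
  }
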
--- llvm/lib/Analysis/ValueTracking.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 2458c1cb9f8ec1..ff14da4c1e73e9 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7880,22 +7880,22 @@ static Value *lookThroughCast(CmpInst *CmpI, Value *V1, Value *V2, } break; case Instruction::FPTrunc: - CastedTo = ConstantExpr::getFPExtend(C, SrcTy, true); + CastedTo = ConstantFoldCastOperand(Instruction::FPExt, C, SrcTy, DL); break; case Instruction::FPExt: - CastedTo = ConstantExpr::getFPTrunc(C, SrcTy, true); + CastedTo = ConstantFoldCastOperand(Instruction::FPTrunc, C, SrcTy, DL); break; case Instruction::FPToUI: - CastedTo = ConstantExpr::getUIToFP(C, SrcTy, true); + CastedTo = ConstantFoldCastOperand(Instruction::UIToFP, C, SrcTy, DL); break; case Instruction::FPToSI: - CastedTo = ConstantExpr::getSIToFP(C, SrcTy, true); + CastedTo = ConstantFoldCastOperand(Instruction::SIToFP, C, SrcTy, DL); break; case Instruction::UIToFP: - CastedTo = ConstantExpr::getFPToUI(C, SrcTy, true); + CastedTo = ConstantFoldCastOperand(Instruction::FPToUI, C, SrcTy, DL); break; case Instruction::SIToFP: - CastedTo = ConstantExpr::getFPToSI(C, SrcTy, true); + CastedTo = ConstantFoldCastOperand(Instruction::FPToSI, C, SrcTy, DL); break; default: break; From 0a22a80c1b83996a4424c94a3597d8f974ecb444 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Mon, 6 Nov 2023 16:22:56 +0300 Subject: [PATCH 10/40] [mlir][vector] Fix extractelement/insertelement folder crash on poison attr (#71333) Types of incoming attributes weren't properly checked. --- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 26 +++++---- mlir/test/Dialect/Vector/canonicalize.mlir | 66 +++++++++++++++++++++- 2 files changed, 79 insertions(+), 13 deletions(-) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 60416f550ee619..69cbdcd3f536f9 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -1188,9 +1188,6 @@ OpFoldResult vector::ExtractElementOp::fold(FoldAdaptor adaptor) { if (!adaptor.getPosition()) return {}; - Attribute src = adaptor.getVector(); - Attribute pos = adaptor.getPosition(); - // Fold extractelement (splat X) -> X. 
if (auto splat = getVector().getDefiningOp()) return splat.getInput(); @@ -1200,13 +1197,16 @@ OpFoldResult vector::ExtractElementOp::fold(FoldAdaptor adaptor) { if (!llvm::isa(broadcast.getSource().getType())) return broadcast.getSource(); + auto src = dyn_cast_or_null(adaptor.getVector()); + auto pos = dyn_cast_or_null(adaptor.getPosition()); if (!pos || !src) return {}; - auto srcElements = llvm::cast(src).getValues(); + auto srcElements = src.getValues(); - auto attr = llvm::dyn_cast(pos); - uint64_t posIdx = attr.getInt(); + uint64_t posIdx = pos.getInt(); + if (posIdx >= srcElements.size()) + return {}; return srcElements[posIdx]; } @@ -2511,18 +2511,20 @@ OpFoldResult vector::InsertElementOp::fold(FoldAdaptor adaptor) { if (!adaptor.getPosition()) return {}; - Attribute src = adaptor.getSource(); - Attribute dst = adaptor.getDest(); - Attribute pos = adaptor.getPosition(); + auto src = dyn_cast_or_null(adaptor.getSource()); + auto dst = dyn_cast_or_null(adaptor.getDest()); + auto pos = dyn_cast_or_null(adaptor.getPosition()); if (!src || !dst || !pos) return {}; - auto dstElements = llvm::cast(dst).getValues(); + if (src.getType() != getDestVectorType().getElementType()) + return {}; + + auto dstElements = dst.getValues(); SmallVector results(dstElements); - auto attr = llvm::dyn_cast(pos); - uint64_t posIdx = attr.getInt(); + uint64_t posIdx = pos.getInt(); if (posIdx >= results.size()) return {}; results[posIdx] = src; diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index f6bb42b1b24915..163fdd67b0cfd3 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -2027,6 +2027,46 @@ func.func @insert_element_invalid_fold() -> vector<1xf32> { return %46 : vector<1xf32> } + +// ----- + +// Do not crash on poison +// CHECK-LABEL: func @insert_poison_fold1 +// CHECK: vector.insertelement +func.func @insert_poison_fold1() -> vector<4xi32> { + %v = ub.poison : vector<4xi32> + %s = arith.constant 7 : i32 + %i = arith.constant 2 : i32 + %1 = vector.insertelement %s, %v[%i : i32] : vector<4xi32> + return %1 : vector<4xi32> +} + +// ----- + +// Do not crash on poison +// CHECK-LABEL: func @insert_poison_fold2 +// CHECK: vector.insertelement +func.func @insert_poison_fold2() -> vector<4xi32> { + %v = arith.constant dense<[0, 1, 2, 3]> : vector<4xi32> + %s = ub.poison : i32 + %i = arith.constant 2 : i32 + %1 = vector.insertelement %s, %v[%i : i32] : vector<4xi32> + return %1 : vector<4xi32> +} + +// ----- + +// Do not crash on poison +// CHECK-LABEL: func @insert_poison_fold3 +// CHECK: vector.insertelement +func.func @insert_poison_fold3() -> vector<4xi32> { + %v = arith.constant dense<[0, 1, 2, 3]> : vector<4xi32> + %s = arith.constant 7 : i32 + %i = ub.poison : i32 + %1 = vector.insertelement %s, %v[%i : i32] : vector<4xi32> + return %1 : vector<4xi32> +} + // ----- // CHECK-LABEL: func @extract_element_fold @@ -2051,6 +2091,30 @@ func.func @extract_element_splat_fold(%a : i32) -> i32 { // ----- +// Do not crash on poison +// CHECK-LABEL: func @extract_element_poison_fold1 +// CHECK: vector.extractelement +func.func @extract_element_poison_fold1() -> i32 { + %v = ub.poison : vector<4xi32> + %i = arith.constant 2 : i32 + %1 = vector.extractelement %v[%i : i32] : vector<4xi32> + return %1 : i32 +} + +// ----- + +// Do not crash on poison +// CHECK-LABEL: func @extract_element_poison_fold2 +// CHECK: vector.extractelement +func.func @extract_element_poison_fold2() -> i32 { + %v = arith.constant 
dense<[1, 3, 5, 7]> : vector<4xi32> + %i = ub.poison : i32 + %1 = vector.extractelement %v[%i : i32] : vector<4xi32> + return %1 : i32 +} + +// ----- + // CHECK-LABEL: func @reduce_one_element_vector_extract // CHECK-SAME: (%[[V:.+]]: vector<1xf32>) // CHECK: %[[S:.+]] = vector.extract %[[V]][0] : f32 from vector<1xf32> @@ -2436,4 +2500,4 @@ func.func @load_store_forwarding_rank_mismatch(%v0: vector<4x1x1xf32>, %arg0: te permutation_map = affine_map<(d0, d1, d2) -> (d1, 0, d2, 0)>} : tensor<4x4x4xf32>, vector<1x100x4x5xf32> return %r : vector<1x100x4x5xf32> -} \ No newline at end of file +} From fbaf2c6cf7b207145dbda0d1cbadd0b446a21199 Mon Sep 17 00:00:00 2001 From: Akash Banerjee Date: Fri, 22 Sep 2023 18:12:06 +0100 Subject: [PATCH 11/40] [OpenMP][Flang] Add "IsolatedFromAbove" trait to omp.target This patch adds the PFT lowering changes required for adding the IsolatedFromAbove trait to omp.target. Key Changes: - Add IsolatedFromAbove trait to target op in MLIR. - Main reason for this change is to prevent CSE and other similar optimisations from crossing region boundaries for target operations. The link below has the discourse discussion surrounding this issue. - Move implicit operand capturing to the PFT lowering stage. - Implicit operands are first added as implicitly captured map_operands with their map_types set accordingly to indicate this. Later, all map_operands including implicit ones are added as block arguments. - Remove `implicit` attribute from the `MapInfoOp`. This information is already captured by the `map_type`. - The custom printer and parser for the map_types have been updated to show the `implicit` and `literal` map_types. - Update related tests. - This fixes #63555. - This fixes #70488. --- flang/lib/Lower/OpenMP.cpp | 298 +++++++++++++++--- .../Driver/OpenMP/map-types-and-sizes.f90 | 52 +++ .../Fir/convert-to-llvm-openmp-and-fir.fir | 10 +- flang/test/Lower/OpenMP/FIR/array-bounds.f90 | 119 +++---- flang/test/Lower/OpenMP/FIR/location.f90 | 2 +- flang/test/Lower/OpenMP/FIR/target.f90 | 106 +++++-- flang/test/Lower/OpenMP/array-bounds.f90 | 6 +- flang/test/Lower/OpenMP/location.f90 | 2 +- flang/test/Lower/OpenMP/target.f90 | 144 ++++++++- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 33 +- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 73 +++++ 11 files changed, 676 insertions(+), 169 deletions(-) create mode 100644 flang/test/Driver/OpenMP/map-types-and-sizes.f90 diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp index 4a73ee87579c71..390bafb49e5124 100644 --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP.cpp @@ -560,11 +560,21 @@ class ClauseProcessor { mlir::Value &result) const; bool processLink(llvm::SmallVectorImpl &result) const; + + // This method is used to process a map clause. + // The optional parameters - mapSymTypes, mapSymLocs & mapSymbols are used to + // store the original type, location and Fortran symbol for the map operands. + // They may be used later on to create the block_arguments for some of the + // target directives that require it. 
bool processMap(mlir::Location currentLocation, const llvm::omp::Directive &directive, Fortran::semantics::SemanticsContext &semanticsContext, Fortran::lower::StatementContext &stmtCtx, - llvm::SmallVectorImpl &mapOperands) const; + llvm::SmallVectorImpl &mapOperands, + llvm::SmallVectorImpl *mapSymTypes = nullptr, + llvm::SmallVectorImpl *mapSymLocs = nullptr, + llvm::SmallVectorImpl + *mapSymbols = nullptr) const; bool processReduction( mlir::Location currentLocation, llvm::SmallVectorImpl &reductionVars, @@ -1691,31 +1701,29 @@ static mlir::omp::MapInfoOp createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value baseAddr, std::stringstream &name, mlir::SmallVector bounds, uint64_t mapType, - mlir::omp::VariableCaptureKind mapCaptureType, bool implicit, - mlir::Type retTy) { - mlir::Value varPtrPtr; + mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, + bool isVal = false) { + mlir::Value val, varPtr, varPtrPtr; + mlir::TypeAttr varType; + if (auto boxTy = baseAddr.getType().dyn_cast()) { baseAddr = builder.create(loc, baseAddr); retTy = baseAddr.getType(); } - mlir::omp::MapInfoOp op = - builder.create(loc, retTy, baseAddr); - op.setVarTypeAttr(mlir::TypeAttr::get( - llvm::dyn_cast(retTy).getElementType())); - op.setNameAttr(builder.getStringAttr(name.str())); - op.setImplicit(implicit); - op.setMapType(mapType); - op.setMapCaptureType(mapCaptureType); - - unsigned insPos = 1; - if (varPtrPtr) - op->insertOperands(insPos++, varPtrPtr); - if (bounds.size() > 0) - op->insertOperands(insPos, bounds); - op->setAttr(mlir::omp::MapInfoOp::getOperandSegmentSizeAttr(), - builder.getDenseI32ArrayAttr( - {1, varPtrPtr ? 1 : 0, static_cast(bounds.size())})); + if (isVal) + val = baseAddr; + else + varPtr = baseAddr; + + if (auto ptrType = llvm::dyn_cast(retTy)) + varType = mlir::TypeAttr::get(ptrType.getElementType()); + + mlir::omp::MapInfoOp op = builder.create( + loc, retTy, val, varPtr, varType, varPtrPtr, bounds, + builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), + builder.getAttr(mapCaptureType), + builder.getStringAttr(name.str())); return op; } @@ -1723,7 +1731,11 @@ bool ClauseProcessor::processMap( mlir::Location currentLocation, const llvm::omp::Directive &directive, Fortran::semantics::SemanticsContext &semanticsContext, Fortran::lower::StatementContext &stmtCtx, - llvm::SmallVectorImpl &mapOperands) const { + llvm::SmallVectorImpl &mapOperands, + llvm::SmallVectorImpl *mapSymTypes, + llvm::SmallVectorImpl *mapSymLocs, + llvm::SmallVectorImpl *mapSymbols) + const { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); return findRepeatableClause( [&](const ClauseTy::Map *mapClause, @@ -1783,13 +1795,20 @@ bool ClauseProcessor::processMap( // Explicit map captures are captured ByRef by default, // optimisation passes may alter this to ByCopy or other capture // types to optimise - mapOperands.push_back(createMapInfoOp( + mlir::Value mapOp = createMapInfoOp( firOpBuilder, clauseLocation, baseAddr, asFortran, bounds, static_cast< std::underlying_type_t>( mapTypeBits), - mlir::omp::VariableCaptureKind::ByRef, false, - baseAddr.getType())); + mlir::omp::VariableCaptureKind::ByRef, baseAddr.getType()); + + mapOperands.push_back(mapOp); + if (mapSymTypes) + mapSymTypes->push_back(baseAddr.getType()); + if (mapSymLocs) + mapSymLocs->push_back(baseAddr.getLoc()); + if (mapSymbols) + mapSymbols->push_back(getOmpObjectSymbol(ompObject)); } }); } @@ -2172,7 +2191,7 @@ static void createBodyOfOp( } } -static void createBodyOfTargetDataOp( 
+static void genBodyOfTargetDataOp( Fortran::lower::AbstractConverter &converter, Fortran::lower::pft::Evaluation &eval, mlir::omp::DataOp &dataOp, const llvm::SmallVector &useDeviceTypes, @@ -2188,18 +2207,17 @@ static void createBodyOfTargetDataOp( unsigned argIndex = 0; for (const Fortran::semantics::Symbol *sym : useDeviceSymbols) { const mlir::BlockArgument &arg = region.front().getArgument(argIndex); - mlir::Value val = fir::getBase(arg); fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*sym); - if (auto refType = val.getType().dyn_cast()) { + if (auto refType = arg.getType().dyn_cast()) { if (fir::isa_builtin_cptr_type(refType.getElementType())) { - converter.bindSymbol(*sym, val); + converter.bindSymbol(*sym, arg); } else { extVal.match( [&](const fir::MutableBoxValue &mbv) { converter.bindSymbol( *sym, fir::MutableBoxValue( - val, fir::factory::getNonDeferredLenParams(extVal), {})); + arg, fir::factory::getNonDeferredLenParams(extVal), {})); }, [&](const auto &) { TODO(converter.getCurrentLocation(), @@ -2407,8 +2425,8 @@ genDataOp(Fortran::lower::AbstractConverter &converter, auto dataOp = converter.getFirOpBuilder().create( currentLocation, ifClauseOperand, deviceOperand, devicePtrOperands, deviceAddrOperands, mapOperands); - createBodyOfTargetDataOp(converter, eval, dataOp, useDeviceTypes, - useDeviceLocs, useDeviceSymbols, currentLocation); + genBodyOfTargetDataOp(converter, eval, dataOp, useDeviceTypes, useDeviceLocs, + useDeviceSymbols, currentLocation); return dataOp; } @@ -2451,6 +2469,101 @@ genEnterExitDataOp(Fortran::lower::AbstractConverter &converter, deviceOperand, nowaitAttr, mapOperands); } +// This function creates a block for the body of the targetOp's region. It adds +// all the symbols present in mapSymbols as block arguments to this block. +static void genBodyOfTargetOp( + Fortran::lower::AbstractConverter &converter, + Fortran::lower::pft::Evaluation &eval, mlir::omp::TargetOp &targetOp, + const llvm::SmallVector &mapSymTypes, + const llvm::SmallVector &mapSymLocs, + const llvm::SmallVector &mapSymbols, + const mlir::Location &currentLocation) { + assert(mapSymTypes.size() == mapSymLocs.size()); + + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + mlir::Region &region = targetOp.getRegion(); + + firOpBuilder.createBlock(&region, {}, mapSymTypes, mapSymLocs); + + unsigned argIndex = 0; + unsigned blockArgsIndex = mapSymbols.size(); + + // The block arguments contain the map_operands followed by the bounds in + // order. This returns a vector containing the next 'n' block arguments for + // the bounds. + auto extractBoundArgs = [&](auto n) { + llvm::SmallVector argExtents; + while (n--) { + argExtents.push_back(fir::getBase(region.getArgument(blockArgsIndex))); + blockArgsIndex++; + } + return argExtents; + }; + + // Bind the symbols to their corresponding block arguments.
+ for (const Fortran::semantics::Symbol *sym : mapSymbols) { + const mlir::BlockArgument &arg = region.getArgument(argIndex); + fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*sym); + extVal.match( + [&](const fir::BoxValue &v) { + converter.bindSymbol( + *sym, + fir::BoxValue(arg, extractBoundArgs(v.getLBounds().size()), + v.getExplicitParameters(), v.getExplicitExtents())); + }, + [&](const fir::MutableBoxValue &v) { + converter.bindSymbol( + *sym, + fir::MutableBoxValue(arg, extractBoundArgs(v.getLBounds().size()), + v.getMutableProperties())); + }, + [&](const fir::ArrayBoxValue &v) { + converter.bindSymbol( + *sym, + fir::ArrayBoxValue(arg, extractBoundArgs(v.getExtents().size()), + extractBoundArgs(v.getLBounds().size()), + v.getSourceBox())); + }, + [&](const fir::CharArrayBoxValue &v) { + converter.bindSymbol( + *sym, + fir::CharArrayBoxValue(arg, v.getLen(), + extractBoundArgs(v.getExtents().size()), + extractBoundArgs(v.getLBounds().size()))); + }, + [&](const fir::CharBoxValue &v) { + converter.bindSymbol(*sym, fir::CharBoxValue(arg, v.getLen())); + }, + [&](const fir::UnboxedValue &v) { converter.bindSymbol(*sym, arg); }, + [&](const auto &) { + TODO(converter.getCurrentLocation(), + "target map clause operand unsupported type"); + }); + argIndex++; + } + + // Insert a dummy instruction to remember the insertion position. The + // marker will be deleted since it has no uses. + // In the HLFIR flow there are hlfir.declares inserted above while + // setting block arguments. + mlir::Value undefMarker = firOpBuilder.create( + targetOp.getOperation()->getLoc(), firOpBuilder.getIndexType()); + + // Create blocks for unstructured regions. This has to be done since + // blocks are initially allocated with the function as the parent region. + if (eval.lowerAsUnstructured()) { + Fortran::lower::createEmptyRegionBlocks( + firOpBuilder, eval.getNestedEvaluations()); + } + + firOpBuilder.create(currentLocation); + + // Set the insertion point after the marker. + firOpBuilder.setInsertionPointAfter(undefMarker.getDefiningOp()); +} + static mlir::omp::TargetOp genTargetOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::pft::Evaluation &eval, @@ -2462,6 +2575,9 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, mlir::Value ifClauseOperand, deviceOperand, threadLimitOperand; mlir::UnitAttr nowaitAttr; llvm::SmallVector mapOperands; + llvm::SmallVector mapSymTypes; + llvm::SmallVector mapSymLocs; + llvm::SmallVector mapSymbols; ClauseProcessor cp(converter, clauseList); cp.processIf(stmtCtx, @@ -2471,7 +2587,7 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, cp.processThreadLimit(stmtCtx, threadLimitOperand); cp.processNowait(nowaitAttr); cp.processMap(currentLocation, directive, semanticsContext, stmtCtx, - mapOperands); + mapOperands, &mapSymTypes, &mapSymLocs, &mapSymbols); cp.processTODO( currentLocation, llvm::omp::Directive::OMPD_target); - return genOpWithBody( - converter, eval, currentLocation, outerCombined, &clauseList, - ifClauseOperand, deviceOperand, threadLimitOperand, nowaitAttr, - mapOperands); + // 5.8.1 Implicit Data-Mapping Attribute Rules + // The following code follows the implicit data-mapping rules to map all the + // symbols used inside the region that have not been explicitly mapped using + // the map clause.
+ auto captureImplicitMap = [&](const Fortran::semantics::Symbol &sym) { + if (llvm::find(mapSymbols, &sym) == mapSymbols.end()) { + mlir::Value baseOp = converter.getSymbolAddress(sym); + if (!baseOp) + if (const auto *details = sym.template detailsIf< + Fortran::semantics::HostAssocDetails>()) { + baseOp = converter.getSymbolAddress(details->symbol()); + converter.copySymbolBinding(details->symbol(), sym); + } + + if (baseOp) { + llvm::SmallVector bounds; + std::stringstream name; + fir::ExtendedValue dataExv = converter.getSymbolExtendedValue(sym); + name << sym.name().ToString(); + + mlir::Value baseAddr = + getDataOperandBaseAddr(converter, converter.getFirOpBuilder(), sym, + converter.getCurrentLocation()); + if (fir::unwrapRefType(baseAddr.getType()).isa()) + bounds = + Fortran::lower::genBoundsOpsFromBox( + converter.getFirOpBuilder(), converter.getCurrentLocation(), + converter, dataExv, baseAddr); + if (fir::unwrapRefType(baseAddr.getType()).isa()) + bounds = Fortran::lower::genBaseBoundsOps( + converter.getFirOpBuilder(), converter.getCurrentLocation(), + converter, dataExv, baseAddr); + + llvm::omp::OpenMPOffloadMappingFlags mapFlag = + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; + mlir::omp::VariableCaptureKind captureKind = + mlir::omp::VariableCaptureKind::ByRef; + if (auto refType = baseOp.getType().dyn_cast()) { + auto eleType = refType.getElementType(); + if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) { + captureKind = mlir::omp::VariableCaptureKind::ByCopy; + } else if (!fir::isa_builtin_cptr_type(eleType)) { + mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + } + } + + mlir::Value mapOp = createMapInfoOp( + converter.getFirOpBuilder(), baseOp.getLoc(), baseOp, name, bounds, + static_cast< + std::underlying_type_t>( + mapFlag), + captureKind, baseOp.getType()); + + mapOperands.push_back(mapOp); + mapSymTypes.push_back(baseOp.getType()); + mapSymLocs.push_back(baseOp.getLoc()); + mapSymbols.push_back(&sym); + } + } + }; + Fortran::lower::pft::visitAllSymbols(eval, captureImplicitMap); + + // Add the bounds and extents for box values to mapOperands + auto addMapInfoForBounds = [&](const auto &bounds) { + for (auto &val : bounds) { + mapSymLocs.push_back(val.getLoc()); + mapSymTypes.push_back(val.getType()); + + llvm::SmallVector bounds; + std::stringstream name; + + mlir::Value mapOp = createMapInfoOp( + converter.getFirOpBuilder(), val.getLoc(), val, name, bounds, + static_cast< + std::underlying_type_t>( + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT), + mlir::omp::VariableCaptureKind::ByCopy, val.getType(), true); + mapOperands.push_back(mapOp); + } + }; + + for (const Fortran::semantics::Symbol *sym : mapSymbols) { + fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*sym); + extVal.match( + [&](const fir::BoxValue &v) { addMapInfoForBounds(v.getLBounds()); }, + [&](const fir::MutableBoxValue &v) { + addMapInfoForBounds(v.getLBounds()); + }, + [&](const fir::ArrayBoxValue &v) { + addMapInfoForBounds(v.getExtents()); + addMapInfoForBounds(v.getLBounds()); + }, + [&](const fir::CharArrayBoxValue &v) { + addMapInfoForBounds(v.getExtents()); + addMapInfoForBounds(v.getLBounds()); + }, + [&](const auto &) { + // Nothing to do for non-box values. 
+ }); + } + + auto targetOp = converter.getFirOpBuilder().create( + currentLocation, ifClauseOperand, deviceOperand, threadLimitOperand, + nowaitAttr, mapOperands); + + genBodyOfTargetOp(converter, eval, targetOp, mapSymTypes, mapSymLocs, + mapSymbols, currentLocation); + + return targetOp; } static mlir::omp::TeamsOp diff --git a/flang/test/Driver/OpenMP/map-types-and-sizes.f90 b/flang/test/Driver/OpenMP/map-types-and-sizes.f90 new file mode 100644 index 00000000000000..e4f429e479af15 --- /dev/null +++ b/flang/test/Driver/OpenMP/map-types-and-sizes.f90 @@ -0,0 +1,52 @@ +!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s + +!=============================================================================== +! Check MapTypes for target implicit captures +!=============================================================================== + +!CHECK: @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 4] +!CHECK: @.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 800] +subroutine mapType_scalar + integer :: a + !$omp target + a = 10 + !$omp end target +end subroutine mapType_scalar + +!CHECK: @.offload_sizes{{.*}} = private unnamed_addr constant [1 x i64] [i64 4096] +!CHECK: @.offload_maptypes{{.*}} = private unnamed_addr constant [1 x i64] [i64 547] +subroutine mapType_array + integer :: a(1024) + !$omp target + a(10) = 20 + !$omp end target +end subroutine mapType_array + +!CHECK: @.offload_sizes{{.*}} = private unnamed_addr constant [1 x i64] [i64 8] +!CHECK: @.offload_maptypes{{.*}} = private unnamed_addr constant [1 x i64] [i64 547] +subroutine mapType_ptr + integer, pointer :: a + !$omp target + a = 10 + !$omp end target +end subroutine mapType_ptr + +!CHECK: @.offload_sizes{{.*}} = private unnamed_addr constant [2 x i64] [i64 8, i64 4] +!CHECK: @.offload_maptypes{{.*}} = private unnamed_addr constant [2 x i64] [i64 544, i64 800] +subroutine mapType_c_ptr + use iso_c_binding, only : c_ptr, c_loc + type(c_ptr) :: a + integer, target :: b + !$omp target + a = c_loc(b) + !$omp end target +end subroutine mapType_c_ptr + +!CHECK: @.offload_sizes{{.*}} = private unnamed_addr constant [1 x i64] [i64 1] +!CHECK: @.offload_maptypes{{.*}} = private unnamed_addr constant [1 x i64] [i64 800] +subroutine mapType_char + character :: a + !$omp target + a = 'b' + !$omp end target +end subroutine mapType_char diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir index ecfb8e52cae783..6efa4d0a095869 100644 --- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir +++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir @@ -433,12 +433,13 @@ func.func @_QPomp_target() { %1 = arith.subi %c512, %c1 : index %2 = omp.bounds lower_bound(%c0 : index) upper_bound(%1 : index) extent(%c512 : index) stride(%c1 : index) start_idx(%c1 : index) %3 = omp.map_info var_ptr(%0 : !fir.ref>, !fir.array<512xi32>) map_clauses(tofrom) capture(ByRef) bounds(%2) -> !fir.ref> {name = "a"} - omp.target thread_limit(%c64_i32 : i32) map_entries(%3 : !fir.ref>) { + omp.target thread_limit(%c64_i32 : i32) map_entries(%3 -> %arg0 : !fir.ref>) { + ^bb0(%arg0: !fir.ref>): %c10_i32 = arith.constant 10 : i32 %c1_i64 = arith.constant 1 : i64 %c1_i64_0 = arith.constant 1 : i64 %4 = arith.subi %c1_i64, %c1_i64_0 : i64 - %5 = fir.coordinate_of %0, %4 : (!fir.ref>, i64) -> !fir.ref + %5 = fir.coordinate_of %arg0, %4 : (!fir.ref>, i64) -> !fir.ref fir.store %c10_i32 to %5 : !fir.ref omp.terminator } @@ -455,12 +456,13 @@ func.func @_QPomp_target() { // CHECK: 
%[[UPPER:.*]] = llvm.mlir.constant(511 : index) : i64 // CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[LOWER]] : i64) upper_bound(%[[UPPER]] : i64) extent(%[[EXTENT]] : i64) stride(%[[STRIDE]] : i64) start_idx(%[[STRIDE]] : i64) // CHECK: %[[MAP:.*]] = omp.map_info var_ptr(%2 : !llvm.ptr, !llvm.array<512 x i32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !llvm.ptr {name = "a"} -// CHECK: omp.target thread_limit(%[[VAL_2]] : i32) map_entries(%[[MAP]] : !llvm.ptr) { +// CHECK: omp.target thread_limit(%[[VAL_2]] : i32) map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !llvm.ptr) { +// CHECK: ^bb0(%[[ARG_0]]: !llvm.ptr): // CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(10 : i32) : i32 // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[VAL_5:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[VAL_6:.*]] = llvm.mlir.constant(0 : i64) : i64 -// CHECK: %[[VAL_7:.*]] = llvm.getelementptr %[[VAL_1]][0, %[[VAL_6]]] : (!llvm.ptr, i64) -> !llvm.ptr +// CHECK: %[[VAL_7:.*]] = llvm.getelementptr %[[ARG_0]][0, %[[VAL_6]]] : (!llvm.ptr, i64) -> !llvm.ptr // CHECK: llvm.store %[[VAL_3]], %[[VAL_7]] : i32, !llvm.ptr // CHECK: omp.terminator // CHECK: } diff --git a/flang/test/Lower/OpenMP/FIR/array-bounds.f90 b/flang/test/Lower/OpenMP/FIR/array-bounds.f90 index a17e4b9dfad36d..abef31af22ba66 100644 --- a/flang/test/Lower/OpenMP/FIR/array-bounds.f90 +++ b/flang/test/Lower/OpenMP/FIR/array-bounds.f90 @@ -1,37 +1,22 @@ -!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefixes HOST -!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefixes DEVICE +!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefixes=HOST,ALL +!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefixes=DEVICE,ALL -!DEVICE-LABEL: func.func @_QPread_write_section_omp_outline_0( -!DEVICE-SAME: %[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref>, %[[ARG2:.*]]: !fir.ref>) attributes {omp.declare_target = #omp.declaretarget, omp.outline_parent_name = "_QPread_write_section"} { -!DEVICE: %[[C1:.*]] = arith.constant 1 : index -!DEVICE: %[[C2:.*]] = arith.constant 4 : index -!DEVICE: %[[C3:.*]] = arith.constant 1 : index -!DEVICE: %[[C4:.*]] = arith.constant 1 : index -!DEVICE: %[[BOUNDS0:.*]] = omp.bounds lower_bound(%[[C1]] : index) upper_bound(%[[C2]] : index) stride(%[[C4]] : index) start_idx(%[[C4]] : index) -!DEVICE: %[[MAP0:.*]] = omp.map_info var_ptr(%[[ARG1]] : !fir.ref>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS0]]) -> !fir.ref> {name = "sp_read(2:5)"} -!DEVICE: %[[C5:.*]] = arith.constant 1 : index -!DEVICE: %[[C6:.*]] = arith.constant 4 : index -!DEVICE: %[[C7:.*]] = arith.constant 1 : index -!DEVICE: %[[C8:.*]] = arith.constant 1 : index -!DEVICE: %[[BOUNDS1:.*]] = omp.bounds lower_bound(%[[C5]] : index) upper_bound(%[[C6]] : index) stride(%[[C8]] : index) start_idx(%[[C8]] : index) -!DEVICE: %[[MAP1:.*]] = omp.map_info var_ptr(%[[ARG2]] : !fir.ref>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS1]]) -> !fir.ref> {name = "sp_write(2:5)"} -!DEVICE: omp.target map_entries(%[[MAP0]], %[[MAP1]] : !fir.ref>, !fir.ref>) { - -!HOST-LABEL: func.func @_QPread_write_section() { -!HOST: %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFread_write_sectionEi"} -!HOST: %[[READ:.*]] = fir.address_of(@_QFread_write_sectionEsp_read) : !fir.ref> -!HOST: %[[WRITE:.*]] = fir.address_of(@_QFread_write_sectionEsp_write) : !fir.ref> -!HOST: %[[C1:.*]] = 
arith.constant 1 : index -!HOST: %[[C2:.*]] = arith.constant 1 : index -!HOST: %[[C3:.*]] = arith.constant 4 : index -!HOST: %[[BOUNDS0:.*]] = omp.bounds lower_bound(%[[C2]] : index) upper_bound(%[[C3]] : index) stride(%[[C1]] : index) start_idx(%[[C1]] : index) -!HOST: %[[MAP0:.*]] = omp.map_info var_ptr(%[[READ]] : !fir.ref>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS0]]) -> !fir.ref> {name = "sp_read(2:5)"} -!HOST: %[[C4:.*]] = arith.constant 1 : index -!HOST: %[[C5:.*]] = arith.constant 1 : index -!HOST: %[[C6:.*]] = arith.constant 4 : index -!HOST: %[[BOUNDS1:.*]] = omp.bounds lower_bound(%[[C5]] : index) upper_bound(%[[C6]] : index) stride(%[[C4]] : index) start_idx(%[[C4]] : index) -!HOST: %[[MAP1:.*]] = omp.map_info var_ptr(%[[WRITE]] : !fir.ref>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS1]]) -> !fir.ref> {name = "sp_write(2:5)"} -!HOST: omp.target map_entries(%[[MAP0]], %[[MAP1]] : !fir.ref>, !fir.ref>) { +!ALL-LABEL: func.func @_QPread_write_section( +!ALL: %[[ITER:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFread_write_sectionEi"} +!ALL: %[[READ:.*]] = fir.address_of(@_QFread_write_sectionEsp_read) : !fir.ref> +!ALL: %[[WRITE:.*]] = fir.address_of(@_QFread_write_sectionEsp_write) : !fir.ref> +!ALL: %[[C1:.*]] = arith.constant 1 : index +!ALL: %[[C2:.*]] = arith.constant 1 : index +!ALL: %[[C3:.*]] = arith.constant 4 : index +!ALL: %[[BOUNDS0:.*]] = omp.bounds lower_bound(%[[C2]] : index) upper_bound(%[[C3]] : index) stride(%[[C1]] : index) start_idx(%[[C1]] : index) +!ALL: %[[MAP0:.*]] = omp.map_info var_ptr(%[[READ]] : !fir.ref>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS0]]) -> !fir.ref> {name = "sp_read(2:5)"} +!ALL: %[[C4:.*]] = arith.constant 1 : index +!ALL: %[[C5:.*]] = arith.constant 1 : index +!ALL: %[[C6:.*]] = arith.constant 4 : index +!ALL: %[[BOUNDS1:.*]] = omp.bounds lower_bound(%[[C5]] : index) upper_bound(%[[C6]] : index) stride(%[[C4]] : index) start_idx(%[[C4]] : index) +!ALL: %[[MAP1:.*]] = omp.map_info var_ptr(%[[WRITE]] : !fir.ref>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS1]]) -> !fir.ref> {name = "sp_write(2:5)"} +!ALL: %[[MAP2:.*]] = omp.map_info var_ptr(%[[ITER]] : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "i"} +!ALL: omp.target map_entries(%[[MAP0]] -> %{{.*}}, %[[MAP1]] -> %{{.*}}, %[[MAP2]] -> %{{.*}}, %{{.*}} -> %{{.*}}, %{{.*}} -> %{{.*}} : !fir.ref>, !fir.ref>, !fir.ref, index, index) { subroutine read_write_section() integer :: sp_read(10) = (/1,2,3,4,5,6,7,8,9,10/) @@ -44,33 +29,22 @@ subroutine read_write_section() !$omp end target end subroutine read_write_section - module assumed_array_routines - contains -!DEVICE-LABEL: func.func @_QMassumed_array_routinesPassumed_shape_array_omp_outline_0( -!DEVICE-SAME: %[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.box>, %[[ARG2:.*]]: !fir.ref>) attributes {omp.declare_target = #omp.declaretarget, omp.outline_parent_name = "_QMassumed_array_routinesPassumed_shape_array"} { -!DEVICE: %[[C0:.*]] = arith.constant 1 : index -!DEVICE: %[[C1:.*]] = arith.constant 4 : index -!DEVICE: %[[C2:.*]] = arith.constant 0 : index -!DEVICE: %[[C3:.*]]:3 = fir.box_dims %[[ARG1]], %[[C2]] : (!fir.box>, index) -> (index, index, index) -!DEVICE: %[[C4:.*]] = arith.constant 1 : index -!DEVICE: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[C0]] : index) upper_bound(%[[C1]] : index) stride(%[[C3]]#2 : index) start_idx(%[[C4]] : index) 
{stride_in_bytes = true} -!DEVICE: %[[ARGADDR:.*]] = fir.box_addr %[[ARG1]] : (!fir.box>) -> !fir.ref> -!DEVICE: %[[MAP:.*]] = omp.map_info var_ptr(%[[ARGADDR]] : !fir.ref>, !fir.array) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} -!DEVICE: omp.target map_entries(%[[MAP]] : !fir.ref>) { +contains +!ALL-LABEL: func.func @_QMassumed_array_routinesPassumed_shape_array( +!ALL-SAME: %[[ARG0:.*]]: !fir.box> {fir.bindc_name = "arr_read_write"}) { +!ALL: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMassumed_array_routinesFassumed_shape_arrayEi"} +!ALL: %[[C0:.*]] = arith.constant 1 : index +!ALL: %[[C1:.*]] = arith.constant 0 : index +!ALL: %[[C2:.*]]:3 = fir.box_dims %arg0, %[[C1]] : (!fir.box>, index) -> (index, index, index) +!ALL: %[[C3:.*]] = arith.constant 1 : index +!ALL: %[[C4:.*]] = arith.constant 4 : index +!ALL: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[C3]] : index) upper_bound(%[[C4]] : index) stride(%[[C2]]#2 : index) start_idx(%[[C0]] : index) {stride_in_bytes = true} +!ALL: %[[ADDROF:.*]] = fir.box_addr %arg0 : (!fir.box>) -> !fir.ref> +!ALL: %[[MAP:.*]] = omp.map_info var_ptr(%[[ADDROF]] : !fir.ref>, !fir.array) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} +!ALL: %[[MAP2:.*]] = omp.map_info var_ptr(%[[ALLOCA]] : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "i"} +!ALL: omp.target map_entries(%[[MAP]] -> %{{.*}}, %[[MAP2]] -> %{{.*}} : !fir.ref>, !fir.ref) { -!HOST-LABEL: func.func @_QMassumed_array_routinesPassumed_shape_array( -!HOST-SAME: %[[ARG0:.*]]: !fir.box> {fir.bindc_name = "arr_read_write"}) { -!HOST: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMassumed_array_routinesFassumed_shape_arrayEi"} -!HOST: %[[C0:.*]] = arith.constant 1 : index -!HOST: %[[C1:.*]] = arith.constant 0 : index -!HOST: %[[C2:.*]]:3 = fir.box_dims %arg0, %[[C1]] : (!fir.box>, index) -> (index, index, index) -!HOST: %[[C3:.*]] = arith.constant 1 : index -!HOST: %[[C4:.*]] = arith.constant 4 : index -!HOST: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[C3]] : index) upper_bound(%[[C4]] : index) stride(%[[C2]]#2 : index) start_idx(%[[C0]] : index) {stride_in_bytes = true} -!HOST: %[[ADDROF:.*]] = fir.box_addr %arg0 : (!fir.box>) -> !fir.ref> -!HOST: %[[MAP:.*]] = omp.map_info var_ptr(%[[ADDROF]] : !fir.ref>, !fir.array) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} -!HOST: omp.target map_entries(%[[MAP]] : !fir.ref>) { subroutine assumed_shape_array(arr_read_write) integer, intent(inout) :: arr_read_write(:) @@ -81,25 +55,17 @@ subroutine assumed_shape_array(arr_read_write) !$omp end target end subroutine assumed_shape_array -!DEVICE-LABEL: func.func @_QMassumed_array_routinesPassumed_size_array_omp_outline_0( -!DEVICE-SAME: %[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref>) attributes {omp.declare_target = #omp.declaretarget, omp.outline_parent_name = "_QMassumed_array_routinesPassumed_size_array"} { -!DEVICE: %[[C0:.*]] = arith.constant 1 : index -!DEVICE: %[[C1:.*]] = arith.constant 4 : index -!DEVICE: %[[C2:.*]] = arith.constant 1 : index -!DEVICE: %[[C3:.*]] = arith.constant 1 : index -!DEVICE: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[C0]] : index) upper_bound(%[[C1]] : index) stride(%[[C3]] : index) start_idx(%[[C3]] : index) -!DEVICE: %[[MAP:.*]] = omp.map_info var_ptr(%[[ARG1]] : !fir.ref>, !fir.array) map_clauses(tofrom) capture(ByRef) 
bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} -!DEVICE: omp.target map_entries(%[[MAP]] : !fir.ref>) { +!ALL-LABEL: func.func @_QMassumed_array_routinesPassumed_size_array( +!ALL-SAME: %[[ARG0:.*]]: !fir.ref> {fir.bindc_name = "arr_read_write"}) { +!ALL: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEi"} +!ALL: %[[C0:.*]] = arith.constant 1 : index +!ALL: %[[C1:.*]] = arith.constant 1 : index +!ALL: %[[C2:.*]] = arith.constant 4 : index +!ALL: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[C1]] : index) upper_bound(%[[C2]] : index) stride(%[[C0]] : index) start_idx(%[[C0]] : index) +!ALL: %[[MAP:.*]] = omp.map_info var_ptr(%[[ARG0]] : !fir.ref>, !fir.array) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} +!ALL: %[[MAP2:.*]] = omp.map_info var_ptr(%[[ALLOCA]] : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "i"} +!ALL: omp.target map_entries(%[[MAP]] -> %{{.*}}, %[[MAP2]] -> %{{.*}}, %{{.*}} -> %{{.*}} : !fir.ref>, !fir.ref, index) { -!HOST-LABEL: func.func @_QMassumed_array_routinesPassumed_size_array( -!HOST-SAME: %[[ARG0:.*]]: !fir.ref> {fir.bindc_name = "arr_read_write"}) { -!HOST: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEi"} -!HOST: %[[C0:.*]] = arith.constant 1 : index -!HOST: %[[C1:.*]] = arith.constant 1 : index -!HOST: %[[C2:.*]] = arith.constant 4 : index -!HOST: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[C1]] : index) upper_bound(%[[C2]] : index) stride(%[[C0]] : index) start_idx(%[[C0]] : index) -!HOST: %[[MAP:.*]] = omp.map_info var_ptr(%[[ARG0]] : !fir.ref>, !fir.array) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} -!HOST: omp.target map_entries(%[[MAP]] : !fir.ref>) { subroutine assumed_size_array(arr_read_write) integer, intent(inout) :: arr_read_write(*) @@ -111,6 +77,7 @@ subroutine assumed_size_array(arr_read_write) end subroutine assumed_size_array end module assumed_array_routines +!DEVICE-NOT:func.func @_QPcall_assumed_shape_and_size_array() { !HOST-LABEL:func.func @_QPcall_assumed_shape_and_size_array() { !HOST:%{{.*}} = arith.constant 20 : index diff --git a/flang/test/Lower/OpenMP/FIR/location.f90 b/flang/test/Lower/OpenMP/FIR/location.f90 index 0e36e09b19e194..64837783767032 100644 --- a/flang/test/Lower/OpenMP/FIR/location.f90 +++ b/flang/test/Lower/OpenMP/FIR/location.f90 @@ -17,7 +17,7 @@ subroutine sub_parallel() !CHECK-LABEL: sub_target subroutine sub_target() print *, x -!CHECK: omp.target { +!CHECK: omp.target {{.*}} { !$omp target print *, x !CHECK: omp.terminator loc(#[[TAR_LOC:.*]]) diff --git a/flang/test/Lower/OpenMP/FIR/target.f90 b/flang/test/Lower/OpenMP/FIR/target.f90 index 95c57c9a224014..d5a8fb242de921 100644 --- a/flang/test/Lower/OpenMP/FIR/target.f90 +++ b/flang/test/Lower/OpenMP/FIR/target.f90 @@ -189,13 +189,14 @@ subroutine omp_target integer :: a(1024) !CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) !CHECK: %[[MAP:.*]] = omp.map_info var_ptr(%[[VAL_0]] : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} - !CHECK: omp.target map_entries(%[[MAP]] : !fir.ref>) { + !CHECK: omp.target map_entries(%[[MAP]] -> %[[ARG_0:.*]], %{{.*}} -> %{{.*}} : !fir.ref>, index) { + !CHECK: ^bb0(%[[ARG_0]]: !fir.ref>, %{{.*}}: 
index): !$omp target map(tofrom: a) !CHECK: %[[VAL_1:.*]] = arith.constant 10 : i32 !CHECK: %[[VAL_2:.*]] = arith.constant 1 : i64 !CHECK: %[[VAL_3:.*]] = arith.constant 1 : i64 !CHECK: %[[VAL_4:.*]] = arith.subi %[[VAL_2]], %[[VAL_3]] : i64 - !CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_4]] : (!fir.ref>, i64) -> !fir.ref + !CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[ARG_0]], %[[VAL_4]] : (!fir.ref>, i64) -> !fir.ref !CHECK: fir.store %[[VAL_1]] to %[[VAL_5]] : !fir.ref a(1) = 10 !CHECK: omp.terminator @@ -203,6 +204,72 @@ subroutine omp_target !CHECK: } end subroutine omp_target +!=============================================================================== +! Target implicit capture +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_target_implicit() { +subroutine omp_target_implicit + !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_implicitEa"} + integer :: a(1024) + !CHECK: %[[MAP:.*]] = omp.map_info var_ptr(%[[VAL_0]] : !fir.ref>, !fir.array<1024xi32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%{{.*}}) -> !fir.ref> {name = "a"} + !CHECK: omp.target map_entries(%[[MAP]] -> %[[ARG_0:.*]], %{{.*}} -> %{{.*}} : !fir.ref>, index) { + !CHECK: ^bb0(%[[ARG_0]]: !fir.ref>, %{{.*}}: index): + !$omp target + !CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[ARG_0]], %{{.*}} : (!fir.ref>, i64) -> !fir.ref + a(1) = 10 + !CHECK: omp.terminator + !$omp end target + !CHECK: } +end subroutine omp_target_implicit + +!=============================================================================== +! Target implicit capture nested +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_target_implicit_nested() { +subroutine omp_target_implicit_nested + integer::a, b + !CHECK: omp.target map_entries(%{{.*}} -> %[[ARG0:.*]], %{{.*}} -> %[[ARG1:.*]] : !fir.ref, !fir.ref) { + !CHECK: ^bb0(%[[ARG0]]: !fir.ref, %[[ARG1]]: !fir.ref): + !$omp target + !CHECK: fir.store %{{.*}} to %[[ARG0]] : !fir.ref + a = 10 + !$omp parallel + !CHECK: fir.store %{{.*}} to %[[ARG1]] : !fir.ref + b = 20 + !CHECK: omp.terminator + !$omp end parallel + !CHECK: omp.terminator + !$omp end target + !CHECK: } +end subroutine omp_target_implicit_nested + +!=============================================================================== +! 
Target implicit capture with bounds +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_target_implicit_bounds(%{{.*}}: !fir.ref {fir.bindc_name = "n"}) { +subroutine omp_target_implicit_bounds(n) + !CHECK: %[[VAL_1:.*]] = arith.select %{{.*}}, %{{.*}}, %{{.*}} : index + !CHECK: %[[VAL_2:.*]] = arith.select %{{.*}}, %{{.*}}, %{{.*}} : index + !CHECK: %[[VAL_3:.*]] = fir.alloca !fir.array, %[[VAL_1]] {bindc_name = "a", uniq_name = "_QFomp_target_implicit_boundsEa"} + integer :: n + integer :: a(n, 1024) + !CHECK: %[[VAL_4:.*]] = omp.map_info var_ptr(%[[VAL_3]] : !fir.ref>, !fir.array) map_clauses(implicit, tofrom) capture(ByRef) bounds(%{{.*}}) -> !fir.ref> {name = "a"} + !CHECK: %[[VAL_5:.*]] = omp.map_info val(%[[VAL_1]] : index) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> index {name = ""} + !CHECK: %[[VAL_6:.*]] = omp.map_info val(%[[VAL_2]] : index) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> index {name = ""} + !CHECK: omp.target map_entries(%[[VAL_4]] -> %[[ARG_1:.*]], %[[VAL_5]] -> %[[ARG_2:.*]], %[[VAL_6]] -> %[[ARG_3:.*]] : !fir.ref>, index, index) { + !CHECK: ^bb0(%[[ARG_1]]: !fir.ref>, %[[ARG_2]]: index, %[[ARG_3]]: index): + !$omp target + !CHECK: %{{.*}} = fir.convert %[[ARG_1]] : (!fir.ref>) -> !fir.ref> + !CHECK: %{{.*}} = arith.muli %{{.*}}, %[[ARG_2]] : index + a(11,22) = 33 + !CHECK: omp.terminator + !$omp end target +!CHECK: } +end subroutine omp_target_implicit_bounds + !=============================================================================== ! Target `thread_limit` clause !=============================================================================== @@ -212,7 +279,8 @@ subroutine omp_target_thread_limit integer :: a !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32 !CHECK: %[[MAP:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "a"} - !CHECK: omp.target thread_limit(%[[VAL_1]] : i32) map_entries(%[[MAP]] : !fir.ref) { + !CHECK: omp.target thread_limit(%[[VAL_1]] : i32) map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !fir.ref) { + !CHECK: ^bb0(%[[ARG_0]]: !fir.ref): !$omp target map(tofrom: a) thread_limit(64) a = 10 !CHECK: omp.terminator @@ -274,23 +342,25 @@ subroutine omp_target_parallel_do !CHECK: %[[C0:.*]] = arith.constant 0 : index !CHECK: %[[SUB:.*]] = arith.subi %[[C1024]], %[[C1]] : index !CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[C0]] : index) upper_bound(%[[SUB]] : index) extent(%[[C1024]] : index) stride(%[[C1]] : index) start_idx(%[[C1]] : index) - !CHECK: %[[MAP:.*]] = omp.map_info var_ptr(%[[VAL_0]] : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} - !CHECK: omp.target map_entries(%[[MAP]] : !fir.ref>) { + !CHECK: %[[MAP1:.*]] = omp.map_info var_ptr(%[[VAL_0]] : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} + !CHECK: %[[MAP2:.*]] = omp.map_info var_ptr(%[[VAL_1]] : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "i"} + !CHECK: omp.target map_entries(%[[MAP1]] -> %[[VAL_2:.*]], %[[MAP2]] -> %[[VAL_3:.*]], %{{.*}} -> %{{.*}} : !fir.ref>, !fir.ref, index) { + !CHECK: ^bb0(%[[VAL_2]]: !fir.ref>, %[[VAL_3]]: !fir.ref, %{{.*}}: index): !CHECK-NEXT: omp.parallel !$omp target parallel do map(tofrom: a) - !CHECK: %[[VAL_2:.*]] = fir.alloca i32 {adapt.valuebyref, pinned} - !CHECK: %[[VAL_3:.*]] = arith.constant 1 : 
i32 - !CHECK: %[[VAL_4:.*]] = arith.constant 1024 : i32 + !CHECK: %[[VAL_4:.*]] = fir.alloca i32 {adapt.valuebyref, pinned} !CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32 - !CHECK: omp.wsloop for (%[[VAL_6:.*]]) : i32 = (%[[VAL_3]]) to (%[[VAL_4]]) inclusive step (%[[VAL_5]]) { - !CHECK: fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref - !CHECK: %[[VAL_7:.*]] = arith.constant 10 : i32 - !CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_2]] : !fir.ref - !CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i32) -> i64 - !CHECK: %[[VAL_10:.*]] = arith.constant 1 : i64 - !CHECK: %[[VAL_11:.*]] = arith.subi %[[VAL_9]], %[[VAL_10]] : i64 - !CHECK: %[[VAL_12:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_11]] : (!fir.ref>, i64) -> !fir.ref - !CHECK: fir.store %[[VAL_7]] to %[[VAL_12]] : !fir.ref + !CHECK: %[[VAL_6:.*]] = arith.constant 1024 : i32 + !CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 + !CHECK: omp.wsloop for (%[[VAL_8:.*]]) : i32 = (%[[VAL_5]]) to (%[[VAL_6]]) inclusive step (%[[VAL_7]]) { + !CHECK: fir.store %[[VAL_8]] to %[[VAL_4]] : !fir.ref + !CHECK: %[[VAL_9:.*]] = arith.constant 10 : i32 + !CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_4]] : !fir.ref + !CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> i64 + !CHECK: %[[VAL_12:.*]] = arith.constant 1 : i64 + !CHECK: %[[VAL_13:.*]] = arith.subi %[[VAL_11]], %[[VAL_12]] : i64 + !CHECK: %[[VAL_14:.*]] = fir.coordinate_of %[[VAL_2]], %[[VAL_13]] : (!fir.ref>, i64) -> !fir.ref + !CHECK: fir.store %[[VAL_9]] to %[[VAL_14]] : !fir.ref do i = 1, 1024 a(i) = 10 end do @@ -301,4 +371,4 @@ subroutine omp_target_parallel_do !CHECK: omp.terminator !CHECK: } !$omp end target parallel do -end subroutine omp_target_parallel_do + end subroutine omp_target_parallel_do diff --git a/flang/test/Lower/OpenMP/array-bounds.f90 b/flang/test/Lower/OpenMP/array-bounds.f90 index e65f17f74f5909..d0c584bec6044a 100644 --- a/flang/test/Lower/OpenMP/array-bounds.f90 +++ b/flang/test/Lower/OpenMP/array-bounds.f90 @@ -22,7 +22,7 @@ !HOST: %[[C6:.*]] = arith.constant 4 : index !HOST: %[[BOUNDS1:.*]] = omp.bounds lower_bound(%[[C5]] : index) upper_bound(%[[C6]] : index) stride(%[[C4]] : index) start_idx(%[[C4]] : index) !HOST: %[[MAP1:.*]] = omp.map_info var_ptr(%[[WRITE_DECL]]#1 : !fir.ref>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS1]]) -> !fir.ref> {name = "sp_write(2:5)"} -!HOST: omp.target map_entries(%[[MAP0]], %[[MAP1]] : !fir.ref>, !fir.ref>) { +!HOST: omp.target map_entries(%[[MAP0]] -> %{{.*}}, %[[MAP1]] -> %{{.*}}, {{.*}} -> {{.*}}, {{.*}} -> {{.*}}, {{.*}} -> {{.*}} : !fir.ref>, !fir.ref>, !fir.ref, index, index) { subroutine read_write_section() integer :: sp_read(10) = (/1,2,3,4,5,6,7,8,9,10/) @@ -50,7 +50,7 @@ module assumed_array_routines !HOST: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[C3]] : index) upper_bound(%[[C4]] : index) stride(%[[C2]]#2 : index) start_idx(%[[C0]] : index) {stride_in_bytes = true} !HOST: %[[ADDROF:.*]] = fir.box_addr %[[ARG0_DECL]]#1 : (!fir.box>) -> !fir.ref> !HOST: %[[MAP:.*]] = omp.map_info var_ptr(%[[ADDROF]] : !fir.ref>, !fir.array) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} -!HOST: omp.target map_entries(%[[MAP]] : !fir.ref>) { +!HOST: omp.target map_entries(%[[MAP]] -> %{{.*}}, {{.*}} -> {{.*}} : !fir.ref>, !fir.ref) { subroutine assumed_shape_array(arr_read_write) integer, intent(inout) :: arr_read_write(:) @@ -73,7 +73,7 @@ end subroutine assumed_shape_array !HOST: %[[C2:.*]] = arith.constant 4 : index !HOST: %[[BOUNDS:.*]] = omp.bounds 
lower_bound(%[[C1]] : index) upper_bound(%[[C2]] : index) stride(%[[C0]] : index) start_idx(%[[C0]] : index) !HOST: %[[MAP:.*]] = omp.map_info var_ptr(%[[ARG0_DECL]]#1 : !fir.ref>, !fir.array) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} -!HOST: omp.target map_entries(%[[MAP]] : !fir.ref>) { +!HOST: omp.target map_entries(%[[MAP]] -> %{{.*}}, {{.*}} -> {{.*}}, {{.*}} -> {{.*}} : !fir.ref>, !fir.ref, index) { subroutine assumed_size_array(arr_read_write) integer, intent(inout) :: arr_read_write(*) diff --git a/flang/test/Lower/OpenMP/location.f90 b/flang/test/Lower/OpenMP/location.f90 index c87bf038e96721..1e01a4828dd9e1 100644 --- a/flang/test/Lower/OpenMP/location.f90 +++ b/flang/test/Lower/OpenMP/location.f90 @@ -17,7 +17,7 @@ subroutine sub_parallel() !CHECK-LABEL: sub_target subroutine sub_target() print *, x -!CHECK: omp.target { +!CHECK: omp.target {{.*}} { !$omp target print *, x !CHECK: omp.terminator loc(#[[TAR_LOC:.*]]) diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 64e740a818fe03..86f456b847df90 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -187,23 +187,112 @@ end subroutine omp_target_data_mt !CHECK-LABEL: func.func @_QPomp_target() { subroutine omp_target - !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_targetEa"} - !CHECK: %[[VAL_0_DECL:.*]]:2 = hlfir.declare %[[VAL_0]](%{{.*}}) {uniq_name = "_QFomp_targetEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFomp_targetEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) integer :: a(1024) !CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) - !CHECK: %[[MAP:.*]] = omp.map_info var_ptr(%[[VAL_0_DECL]]#1 : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} - !CHECK: omp.target map_entries(%[[MAP]] : !fir.ref>) { + !CHECK: %[[MAP:.*]] = omp.map_info var_ptr(%[[VAL_1]]#1 : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} + !CHECK: omp.target map_entries(%[[MAP]] -> %[[ARG_0:.*]], %{{.*}} -> %[[ARG_1:.*]] : !fir.ref>, index) { + !CHECK: ^bb0(%[[ARG_0]]: !fir.ref>, %[[ARG_1]]: index): !$omp target map(tofrom: a) - !CHECK: %[[C10:.*]] = arith.constant 10 : i32 - !CHECK: %[[C1:.*]] = arith.constant 1 : index - !CHECK: %[[A_1:.*]] = hlfir.designate %[[VAL_0_DECL]]#0 (%[[C1]]) : (!fir.ref>, index) -> !fir.ref - !CHECK: hlfir.assign %[[C10]] to %[[A_1]] : i32, !fir.ref + !CHECK: %[[VAL_2:.*]] = fir.shape %[[ARG_1]] : (index) -> !fir.shape<1> + !CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG_0]](%[[VAL_2]]) {uniq_name = "_QFomp_targetEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + !CHECK: %[[VAL_4:.*]] = arith.constant 10 : i32 + !CHECK: %[[VAL_5:.*]] = arith.constant 1 : index + !CHECK: %[[VAL_6:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_5]]) : (!fir.ref>, index) -> !fir.ref + !CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_6]] : i32, !fir.ref a(1) = 10 - !CHECK: omp.terminator + !CHECK: omp.terminator !$omp end target !CHECK: } end subroutine omp_target +!=============================================================================== +! 
Target implicit capture +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_target_implicit() { +subroutine omp_target_implicit + !CHECK: %[[VAL_0:.*]] = arith.constant 1024 : index + !CHECK: %[[VAL_1:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_implicitEa"} + !CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_0]] : (index) -> !fir.shape<1> + !CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_2]]) {uniq_name = "_QFomp_target_implicitEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + integer :: a(1024) + !CHECK: %[[VAL_4:.*]] = omp.map_info var_ptr(%[[VAL_3]]#1 : !fir.ref>, !fir.array<1024xi32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%{{.*}}) -> !fir.ref> {name = "a"} + !CHECK: %[[VAL_5:.*]] = omp.map_info val(%[[VAL_0]] : index) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> index {name = ""} + !CHECK: omp.target map_entries(%[[VAL_4]] -> %[[VAL_6:.*]], %[[VAL_5]] -> %[[VAL_7:.*]] : !fir.ref>, index) { + !CHECK: ^bb0(%[[VAL_6]]: !fir.ref>, %[[VAL_7]]: index): + !$omp target + !CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> + !CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_6]](%[[VAL_8]]) {uniq_name = "_QFomp_target_implicitEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + !CHECK: %[[VAL_10:.*]] = arith.constant 10 : i32 + !CHECK: %[[VAL_11:.*]] = arith.constant 1 : index + !CHECK: %[[VAL_12:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_11]]) : (!fir.ref>, index) -> !fir.ref + !CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_12]] : i32, !fir.ref + a(1) = 10 + !CHECK: omp.terminator + !$omp end target + !CHECK: } +end subroutine omp_target_implicit + +!=============================================================================== +! Target implicit capture nested +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_target_implicit_nested() { +subroutine omp_target_implicit_nested + integer::a, b + !CHECK: omp.target map_entries(%{{.*}} -> %[[ARG0:.*]], %{{.*}} -> %[[ARG1:.*]] : !fir.ref, !fir.ref) { + !CHECK: ^bb0(%[[ARG0]]: !fir.ref, %[[ARG1]]: !fir.ref): + !$omp target + !CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFomp_target_implicit_nestedEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFomp_target_implicit_nestedEb"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !CHECK: %[[VAL_10:.*]] = arith.constant 10 : i32 + !CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_8]]#0 : i32, !fir.ref + a = 10 + !CHECK: omp.parallel + !$omp parallel + !CHECK: %[[VAL_11:.*]] = arith.constant 20 : i32 + !CHECK: hlfir.assign %[[VAL_11]] to %[[VAL_9]]#0 : i32, !fir.ref + b = 20 + !CHECK: omp.terminator + !$omp end parallel + !CHECK: omp.terminator + !$omp end target + !CHECK: } +end subroutine omp_target_implicit_nested + +!=============================================================================== +! 
Target implicit capture with bounds +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_target_implicit_bounds(%{{.*}}: !fir.ref {fir.bindc_name = "n"}) { +subroutine omp_target_implicit_bounds(n) + !CHECK: %[[VAL_1:.*]] = arith.select %{{.*}}, %{{.*}}, %{{.*}} : index + !CHECK: %[[VAL_2:.*]] = arith.select %{{.*}}, %{{.*}}, %{{.*}} : index + !CHECK: %[[VAL_3:.*]] = fir.alloca !fir.array, %[[VAL_1]] {bindc_name = "a", uniq_name = "_QFomp_target_implicit_boundsEa"} + !CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2> + !CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_4]]) {uniq_name = "_QFomp_target_implicit_boundsEa"} : (!fir.ref>, !fir.shape<2>) -> (!fir.box>, !fir.ref>) + integer :: n + integer :: a(n, 1024) + !CHECK: %[[VAL_6:.*]] = omp.map_info var_ptr(%[[VAL_5]]#1 : !fir.ref>, !fir.array) map_clauses(implicit, tofrom) capture(ByRef) bounds(%{{.*}}) -> !fir.ref> {name = "a"} + !CHECK: %[[VAL_7:.*]] = omp.map_info val(%[[VAL_1]] : index) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> index {name = ""} + !CHECK: %[[VAL_8:.*]] = omp.map_info val(%[[VAL_2]] : index) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> index {name = ""} + !CHECK: omp.target map_entries(%[[VAL_6]] -> %[[ARG_1:.*]], %[[VAL_7]] -> %[[ARG_2:.*]], %[[VAL_8]] -> %[[ARG_3:.*]] : !fir.ref>, index, index) { + !CHECK: ^bb0(%[[ARG_1]]: !fir.ref>, %[[ARG_2]]: index, %[[ARG_3]]: index): + !$omp target + !CHECK: %[[VAL_9:.*]] = fir.shape %[[ARG_2]], %[[ARG_3]] : (index, index) -> !fir.shape<2> + !CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[ARG_1]](%[[VAL_9]]) {uniq_name = "_QFomp_target_implicit_boundsEa"} : (!fir.ref>, !fir.shape<2>) -> (!fir.box>, !fir.ref>) + !CHECK: %[[VAL_11:.*]] = arith.constant 33 : i32 + !CHECK: %[[VAL_12:.*]] = arith.constant 11 : index + !CHECK: %[[VAL_13:.*]] = arith.constant 22 : index + !CHECK: %[[VAL_14:.*]] = hlfir.designate %[[VAL_10]]#0 (%[[VAL_12]], %[[VAL_13]]) : (!fir.box>, index, index) -> !fir.ref + !CHECK: hlfir.assign %[[VAL_11]] to %[[VAL_14]] : i32, !fir.ref + a(11, 22) = 33 + !CHECK: omp.terminator + !$omp end target +!CHECK: } +end subroutine omp_target_implicit_bounds + !=============================================================================== ! 
Target `thread_limit` clause !=============================================================================== @@ -213,7 +302,8 @@ subroutine omp_target_thread_limit integer :: a !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32 !CHECK: %[[MAP:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "a"} - !CHECK: omp.target thread_limit(%[[VAL_1]] : i32) map_entries(%[[MAP]] : !fir.ref) { + !CHECK: omp.target thread_limit(%[[VAL_1]] : i32) map_entries(%[[MAP]] -> %{{.*}} : !fir.ref) { + !CHECK: ^bb0(%{{.*}}: !fir.ref): !$omp target map(tofrom: a) thread_limit(64) a = 10 !CHECK: omp.terminator @@ -306,8 +396,10 @@ subroutine omp_target_parallel_do !CHECK: %[[SUB:.*]] = arith.subi %[[C1024]], %[[C1]] : index !CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[C0]] : index) upper_bound(%[[SUB]] : index) extent(%[[C1024]] : index) stride(%[[C1]] : index) start_idx(%[[C1]] : index) !CHECK: %[[MAP:.*]] = omp.map_info var_ptr(%[[VAL_0_DECL]]#1 : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} - !CHECK: omp.target map_entries(%[[MAP]] : !fir.ref>) { - !CHECK-NEXT: omp.parallel + !CHECK: omp.target map_entries(%[[MAP]] -> %[[ARG_0:.*]], %{{.*}} -> %{{.*}}, %{{.*}} -> %{{.*}} : !fir.ref>, !fir.ref, index) { + !CHECK: ^bb0(%[[ARG_0]]: !fir.ref>, %{{.*}}: !fir.ref, %{{.*}}: index): + !CHECK: %[[VAL_0_DECL:.*]]:2 = hlfir.declare %[[ARG_0]](%{{.*}}) {uniq_name = "_QFomp_target_parallel_doEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + !CHECK: omp.parallel !$omp target parallel do map(tofrom: a) !CHECK: %[[I_PVT_ALLOCA:.*]] = fir.alloca i32 {adapt.valuebyref, pinned} !CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_ALLOCA]] {uniq_name = "_QFomp_target_parallel_doEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -329,3 +421,31 @@ subroutine omp_target_parallel_do !CHECK: } !$omp end target parallel do end subroutine omp_target_parallel_do + +!=============================================================================== +! 
Target with unstructured code +!=============================================================================== + +!CHECK-LABEL: func.func @_QPtarget_unstructured() { +subroutine target_unstructured + !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtarget_unstructuredEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: i = 1 + !CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtarget_unstructuredEj"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: j = 11 + !CHECK: %[[VAL_4:.*]] = omp.map_info var_ptr(%[[VAL_1]]#1 : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "i"} + !CHECK: %[[VAL_5:.*]] = omp.map_info var_ptr(%[[VAL_3]]#1 : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "j"} + !CHECK: omp.target map_entries(%[[VAL_4]] -> %[[VAL_6:.*]], %[[VAL_5]] -> %[[VAL_7:.*]] : !fir.ref, !fir.ref) { + !CHECK: ^bb0(%[[VAL_6]]: !fir.ref, %[[VAL_7]]: !fir.ref): + !$omp target + !CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFtarget_unstructuredEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFtarget_unstructuredEj"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !CHECK: ^bb1: + do while (i <= j) + !CHECK: ^bb2: + i = i + 1 + end do + !CHECK: ^bb3: + !CHECK: omp.terminator + !$omp end target + !CHECK: } +end subroutine target_unstructured diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 72121ad4f9e47a..99ac5cfb7b9e92 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -1144,15 +1144,15 @@ def DataBoundsOp : OpenMP_Op<"bounds", } def MapInfoOp : OpenMP_Op<"map_info", [AttrSizedOperandSegments]> { - let arguments = (ins OpenMP_PointerLikeType:$var_ptr, - TypeAttr:$var_type, + let arguments = (ins Optional:$val, + Optional:$var_ptr, + OptionalAttr:$var_type, Optional:$var_ptr_ptr, Variadic:$bounds, /* rank-0 to rank-{n-1} */ OptionalAttr:$map_type, OptionalAttr:$map_capture_type, - DefaultValuedAttr:$implicit, OptionalAttr:$name); - let results = (outs OpenMP_PointerLikeType:$omp_ptr); + let results = (outs AnyType:$omp_ptr); let description = [{ The MapInfoOp captures information relating to individual OpenMP map clauses @@ -1178,22 +1178,20 @@ def MapInfoOp : OpenMP_Op<"map_info", [AttrSizedOperandSegments]> { ``` => ```mlir - omp.map_info var_ptr(%index_ssa) map_type(to) map_capture_type(ByRef) implicit(false) + omp.map_info var_ptr(%index_ssa) map_type(to) map_capture_type(ByRef) name(index) ``` Description of arguments: + - `val`: The value to copy. - `var_ptr`: The address of variable to copy. - - `var_type`: The type of the variable to copy. + - `var_type`: The type of the variable/value to copy. - `var_ptr_ptr`: Used when the variable copied is a member of a class, structure or derived type and refers to the originating struct. - `bounds`: Used when copying slices of array's, pointers or pointer members of objects (e.g. derived types or classes), indicates the bounds to be copied of the variable. When it's an array slice it is in rank order where rank 0 is the inner-most dimension. - - `implicit`: indicates where the map item has been specified explicitly in a - map clause or captured implicitly by being used in a target region with no - map or other data mapping construct. 
- 'map_clauses': OpenMP map type for this map capture, for example: from, to and always. It's a bitfield composed of the OpenMP runtime flags stored in OpenMPOffloadMappingFlags. @@ -1203,9 +1201,10 @@ def MapInfoOp : OpenMP_Op<"map_info", [AttrSizedOperandSegments]> { }]; let assemblyFormat = [{ - `var_ptr` `(` $var_ptr `:` type($var_ptr) `,` $var_type`)` oilist( - `var_ptr_ptr` `(` $var_ptr_ptr `:` type($var_ptr_ptr) `)` + `val` `(` $val `:` type($val) `)` + | `var_ptr` `(` $var_ptr `:` type($var_ptr) `,` $var_type `)` + | `var_ptr_ptr` `(` $var_ptr_ptr `:` type($var_ptr_ptr) `)` | `map_clauses` `(` custom($map_type) `)` | `capture` `(` custom($map_capture_type) `)` | `bounds` `(` $bounds `)` @@ -1265,7 +1264,7 @@ def Target_DataOp: OpenMP_Op<"target_data", [AttrSizedOperandSegments]>{ Optional:$device, Variadic:$use_device_ptr, Variadic:$use_device_addr, - Variadic:$map_operands); + Variadic:$map_operands); let regions = (region AnyRegion:$region); @@ -1314,7 +1313,7 @@ def Target_EnterDataOp: OpenMP_Op<"target_enter_data", let arguments = (ins Optional:$if_expr, Optional:$device, UnitAttr:$nowait, - Variadic:$map_operands); + Variadic:$map_operands); let assemblyFormat = [{ oilist(`if` `(` $if_expr `:` type($if_expr) `)` @@ -1360,7 +1359,7 @@ def Target_ExitDataOp: OpenMP_Op<"target_exit_data", let arguments = (ins Optional:$if_expr, Optional:$device, UnitAttr:$nowait, - Variadic:$map_operands); + Variadic:$map_operands); let assemblyFormat = [{ oilist(`if` `(` $if_expr `:` type($if_expr) `)` @@ -1377,7 +1376,7 @@ def Target_ExitDataOp: OpenMP_Op<"target_exit_data", // 2.14.5 target construct //===----------------------------------------------------------------------===// -def TargetOp : OpenMP_Op<"target",[OutlineableOpenMPOpInterface, AttrSizedOperandSegments]> { +def TargetOp : OpenMP_Op<"target",[IsolatedFromAbove, OutlineableOpenMPOpInterface, AttrSizedOperandSegments]> { let summary = "target construct"; let description = [{ The target construct includes a region of code which is to be executed @@ -1403,7 +1402,7 @@ def TargetOp : OpenMP_Op<"target",[OutlineableOpenMPOpInterface, AttrSizedOperan Optional:$device, Optional:$thread_limit, UnitAttr:$nowait, - Variadic:$map_operands); + Variadic:$map_operands); let regions = (region AnyRegion:$region); @@ -1412,7 +1411,7 @@ def TargetOp : OpenMP_Op<"target",[OutlineableOpenMPOpInterface, AttrSizedOperan | `device` `(` $device `:` type($device) `)` | `thread_limit` `(` $thread_limit `:` type($thread_limit) `)` | `nowait` $nowait - | `map_entries` `(` $map_operands `:` type($map_operands) `)` + | `map_entries` `(` custom($map_operands, type($map_operands)) `)` ) $region attr-dict }]; diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 3b792a26d1823f..a56e75eae06bb0 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -691,6 +691,9 @@ static ParseResult parseMapClause(OpAsmParser &parser, IntegerAttr &mapType) { if (mapTypeMod == "always") mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS; + if (mapTypeMod == "implicit") + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; + if (mapTypeMod == "close") mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_CLOSE; @@ -737,6 +740,9 @@ static void printMapClause(OpAsmPrinter &p, Operation *op, if (mapTypeToBitFlag(mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS)) mapTypeStrs.push_back("always"); + if 
(mapTypeToBitFlag(mapTypeBits,
+ llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT))
+ mapTypeStrs.push_back("implicit");
 if (mapTypeToBitFlag(mapTypeBits,
 llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_CLOSE))
 mapTypeStrs.push_back("close");
@@ -777,6 +783,64 @@ static void printMapClause(OpAsmPrinter &p, Operation *op,
 }
 }
+static ParseResult
+parseMapEntries(OpAsmParser &parser,
+ SmallVectorImpl &mapOperands,
+ SmallVectorImpl &mapOperandTypes) {
+ OpAsmParser::UnresolvedOperand arg;
+ OpAsmParser::UnresolvedOperand blockArg;
+ Type argType;
+ auto parseEntries = [&]() -> ParseResult {
+ if (parser.parseOperand(arg) || parser.parseArrow() ||
+ parser.parseOperand(blockArg))
+ return failure();
+ mapOperands.push_back(arg);
+ return success();
+ };
+
+ auto parseTypes = [&]() -> ParseResult {
+ if (parser.parseType(argType))
+ return failure();
+ mapOperandTypes.push_back(argType);
+ return success();
+ };
+
+ if (parser.parseCommaSeparatedList(parseEntries))
+ return failure();
+
+ if (parser.parseColon())
+ return failure();
+
+ if (parser.parseCommaSeparatedList(parseTypes))
+ return failure();
+
+ return success();
+}
+
+static void printMapEntries(OpAsmPrinter &p, Operation *op,
+ OperandRange mapOperands,
+ TypeRange mapOperandTypes) {
+ auto &region = op->getRegion(0);
+ unsigned argIndex = 0;
+
+ for (const auto &mapOp : mapOperands) {
+ const auto &blockArg = region.front().getArgument(argIndex);
+ p << mapOp << " -> " << blockArg;
+ argIndex++;
+ if (argIndex < mapOperands.size())
+ p << ", ";
+ }
+ p << " : ";
+
+ argIndex = 0;
+ for (const auto &mapType : mapOperandTypes) {
+ p << mapType;
+ argIndex++;
+ if (argIndex < mapOperands.size())
+ p << ", ";
+ }
+}
+
 static void printCaptureType(OpAsmPrinter &p, Operation *op,
 VariableCaptureKindAttr mapCaptureType) {
 std::string typeCapStr;
@@ -823,6 +887,15 @@ static LogicalResult verifyMapClause(Operation *op, OperandRange mapOperands) {
 if (auto MapInfoOp =
 mlir::dyn_cast(mapOp.getDefiningOp())) {
+ if (MapInfoOp.getVal() && MapInfoOp.getVarPtr())
+ emitError(op->getLoc(), "only one of val or var_ptr must be used");
+
+ if (!MapInfoOp.getVal() && !MapInfoOp.getVarPtr())
+ emitError(op->getLoc(), "missing val or var_ptr");
+
+ if (!MapInfoOp.getVarPtr() && MapInfoOp.getVarType().has_value())
+ emitError(op->getLoc(), "var_type supplied without var_ptr");
+
 if (!MapInfoOp.getMapType().has_value())
 emitError(op->getLoc(), "missing map type for map operand");

From 72e2387c0528be72999beabf532e6929d3054e03 Mon Sep 17 00:00:00 2001
From: Akash Banerjee
Date: Fri, 22 Sep 2023 18:13:46 +0100
Subject: [PATCH 12/40] [OpenMP][MLIR] Add "IsolatedFromAbove" trait to
 omp.target

This patch adds the MLIR translation changes required for adding the
IsolatedFromAbove and OutlineableOpenMPOpInterface traits to omp.target.
It links the newly added block arguments to their corresponding LLVM
values.

Depends on #67164.
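Illustrative example (hand-written for this note, not taken from the diff
below; the variable name is made up): after this change, each map operand is
bound to a region block argument via `->` in `map_entries`, and the
IsolatedFromAbove region may only refer to those block arguments, never to
values defined in the parent function:

```mlir
// Minimal sketch. %a is assumed to be an !llvm.ptr defined in the
// enclosing function.
%m = omp.map_info var_ptr(%a : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "a"}
omp.target map_entries(%m -> %arg0 : !llvm.ptr) {
^bb0(%arg0: !llvm.ptr):
  // The region must use %arg0, not %a: the op is IsolatedFromAbove.
  %c = llvm.mlir.constant(10 : i32) : i32
  llvm.store %c, %arg0 : i32, !llvm.ptr
  omp.terminator
}
```

During translation, each block argument is mapped to the LLVM value looked up
from the corresponding map operand, which is what the convertOmpTarget change
below implements.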
--- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 51 +++++++++++++++---- .../OpenMPToLLVM/convert-to-llvmir.mlir | 20 +++++--- mlir/test/Dialect/OpenMP/canonicalize.mlir | 5 +- mlir/test/Dialect/OpenMP/invalid.mlir | 40 ++++++++++++++- mlir/test/Dialect/OpenMP/ops.mlir | 21 +++++--- .../omptarget-array-sectioning-host.mlir | 15 +++--- ...target-byref-bycopy-generation-device.mlir | 9 ++-- ...mptarget-byref-bycopy-generation-host.mlir | 7 +-- .../omptarget-declare-target-llvm-device.mlir | 13 ++--- .../LLVMIR/omptarget-region-device-llvm.mlir | 9 ++-- .../omptarget-region-llvm-target-device.mlir | 9 ++-- .../Target/LLVMIR/omptarget-region-llvm.mlir | 9 ++-- .../omptarget-region-parallel-llvm.mlir | 9 ++-- 13 files changed, 156 insertions(+), 61 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 1daf60b8659bb6..8e741bf83be32f 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1733,12 +1733,13 @@ void collectMapDataFromMapOperands(MapInfoData &mapData, "missing map info operation or incorrect map info operation type"); if (auto mapOp = mlir::dyn_cast_if_present( mapValue.getDefiningOp())) { - mapData.OriginalValue.push_back( - moduleTranslation.lookupValue(mapOp.getVarPtr())); + mapData.OriginalValue.push_back(moduleTranslation.lookupValue( + mapOp.getVarPtr() ? mapOp.getVarPtr() : mapOp.getVal())); mapData.Pointers.push_back(mapData.OriginalValue.back()); if (llvm::Value *refPtr = getRefPtrIfDeclareTarget( - mapOp.getVarPtr(), moduleTranslation)) { // declare target + mapOp.getVarPtr() ? mapOp.getVarPtr() : mapOp.getVal(), + moduleTranslation)) { // declare target mapData.IsDeclareTarget.push_back(true); mapData.BasePointers.push_back(refPtr); } else { // regular mapped variable @@ -1746,10 +1747,14 @@ void collectMapDataFromMapOperands(MapInfoData &mapData, mapData.BasePointers.push_back(mapData.OriginalValue.back()); } - mapData.Sizes.push_back(getSizeInBytes(dl, mapOp.getVarType(), mapOp, - builder, moduleTranslation)); - mapData.BaseType.push_back( - moduleTranslation.convertType(mapOp.getVarType())); + mapData.Sizes.push_back( + getSizeInBytes(dl, + mapOp.getVal() ? mapOp.getVal().getType() + : mapOp.getVarType().value(), + mapOp, builder, moduleTranslation)); + mapData.BaseType.push_back(moduleTranslation.convertType( + mapOp.getVal() ? 
mapOp.getVal().getType() + : mapOp.getVarType().value())); mapData.MapClause.push_back(mapOp.getOperation()); mapData.Types.push_back( llvm::omp::OpenMPOffloadMappingFlags(mapOp.getMapType().value())); @@ -1796,6 +1801,13 @@ static void genMapInfos(llvm::IRBuilderBase &builder, else if (isTargetParams) mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM; + if (auto mapInfoOp = dyn_cast(mapData.MapClause[i])) + if (mapInfoOp.getMapCaptureType().value() == + mlir::omp::VariableCaptureKind::ByCopy && + !(mapInfoOp.getVarType().has_value() && + mapInfoOp.getVarType()->isa())) + mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_LITERAL; + combinedInfo.BasePointers.emplace_back(mapData.BasePointers[i]); combinedInfo.Pointers.emplace_back(mapData.Pointers[i]); combinedInfo.DevicePointers.emplace_back(mapData.DevicePointers[i]); @@ -2318,6 +2330,19 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, auto targetOp = cast(opInst); auto &targetRegion = targetOp.getRegion(); + DataLayout dl = DataLayout(opInst.getParentOfType()); + SmallVector mapOperands = targetOp.getMapOperands(); + + // Remove mapOperands/blockArgs that have no use inside the region. + assert(mapOperands.size() == targetRegion.getNumArguments() && + "Number of mapOperands must be same as block_arguments"); + for (size_t i = 0; i < mapOperands.size(); i++) { + if (targetRegion.getArgument(i).use_empty()) { + targetRegion.eraseArgument(i); + mapOperands.erase(&mapOperands[i]); + i--; + } + } LogicalResult bodyGenStatus = success(); @@ -2325,6 +2350,16 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) -> InsertPointTy { builder.restoreIP(codeGenIP); + unsigned argIndex = 0; + for (auto &mapOp : mapOperands) { + auto mapInfoOp = + mlir::dyn_cast(mapOp.getDefiningOp()); + llvm::Value *mapOpValue = moduleTranslation.lookupValue( + mapInfoOp.getVarPtr() ? 
mapInfoOp.getVarPtr() : mapInfoOp.getVal()); + const auto &arg = targetRegion.front().getArgument(argIndex); + moduleTranslation.mapValue(arg, mapOpValue); + argIndex++; + } llvm::BasicBlock *exitBlock = convertOmpOpRegions( targetRegion, "omp.target", builder, moduleTranslation, bodyGenStatus); builder.SetInsertPoint(exitBlock); @@ -2352,8 +2387,6 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::InsertPointTy allocaIP = findAllocaInsertPoint(builder, moduleTranslation); - DataLayout dl = DataLayout(opInst.getParentOfType()); - llvm::SmallVector mapOperands = targetOp.getMapOperands(); MapInfoData mapData; collectMapDataFromMapOperands(mapData, mapOperands, moduleTranslation, dl, builder); diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir index bbf50617edf944..3fbeaebb592a4d 100644 --- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir +++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir @@ -244,10 +244,12 @@ llvm.func @_QPomp_target_data_region(%a : !llvm.ptr, %i : !llvm.ptr) { // CHECK: %[[ARG_0:.*]]: !llvm.ptr, // CHECK: %[[ARG_1:.*]]: !llvm.ptr) { // CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(64 : i32) : i32 -// CHECK: %[[MAP:.*]] = omp.map_info var_ptr(%[[ARG_0]] : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} -// CHECK: omp.target thread_limit(%[[VAL_0]] : i32) map_entries(%[[MAP]] : !llvm.ptr) { +// CHECK: %[[MAP1:.*]] = omp.map_info var_ptr(%[[ARG_0]] : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} +// CHECK: %[[MAP2:.*]] = omp.map_info var_ptr(%[[ARG_1]] : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = ""} +// CHECK: omp.target thread_limit(%[[VAL_0]] : i32) map_entries(%[[MAP1]] -> %[[BB_ARG0:.*]], %[[MAP2]] -> %[[BB_ARG1:.*]] : !llvm.ptr, !llvm.ptr) { +// CHECK: ^bb0(%[[BB_ARG0]]: !llvm.ptr, %[[BB_ARG1]]: !llvm.ptr): // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(10 : i32) : i32 -// CHECK: llvm.store %[[VAL_1]], %[[ARG_1]] : i32, !llvm.ptr +// CHECK: llvm.store %[[VAL_1]], %[[BB_ARG1]] : i32, !llvm.ptr // CHECK: omp.terminator // CHECK: } // CHECK: llvm.return @@ -256,9 +258,11 @@ llvm.func @_QPomp_target_data_region(%a : !llvm.ptr, %i : !llvm.ptr) { llvm.func @_QPomp_target(%a : !llvm.ptr, %i : !llvm.ptr) { %0 = llvm.mlir.constant(64 : i32) : i32 %1 = omp.map_info var_ptr(%a : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target thread_limit(%0 : i32) map_entries(%1 : !llvm.ptr) { + %3 = omp.map_info var_ptr(%i : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = ""} + omp.target thread_limit(%0 : i32) map_entries(%1 -> %arg0, %3 -> %arg1 : !llvm.ptr, !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): %2 = llvm.mlir.constant(10 : i32) : i32 - llvm.store %2, %i : i32, !llvm.ptr + llvm.store %2, %arg1 : i32, !llvm.ptr omp.terminator } llvm.return @@ -449,7 +453,8 @@ llvm.func @sub_() { // CHECK: %[[C_14:.*]] = llvm.mlir.constant(1 : index) : i64 // CHECK: %[[BOUNDS1:.*]] = omp.bounds lower_bound(%[[C_12]] : i64) upper_bound(%[[C_11]] : i64) stride(%[[C_14]] : i64) start_idx(%[[C_14]] : i64) // CHECK: %[[MAP1:.*]] = omp.map_info var_ptr(%[[ARG_2]] : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS1]]) -> !llvm.ptr {name = ""} -// CHECK: 
omp.target map_entries(%[[MAP0]], %[[MAP1]] : !llvm.ptr, !llvm.ptr) { +// CHECK: omp.target map_entries(%[[MAP0]] -> %[[BB_ARG0:.*]], %[[MAP1]] -> %[[BB_ARG1:.*]] : !llvm.ptr, !llvm.ptr) { +// CHECK: ^bb0(%[[BB_ARG0]]: !llvm.ptr, %[[BB_ARG1]]: !llvm.ptr): // CHECK: omp.terminator // CHECK: } // CHECK: llvm.return @@ -468,7 +473,8 @@ llvm.func @_QPtarget_map_with_bounds(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: %9 = llvm.mlir.constant(1 : index) : i64 %10 = omp.bounds lower_bound(%7 : i64) upper_bound(%6 : i64) stride(%9 : i64) start_idx(%9 : i64) %11 = omp.map_info var_ptr(%arg2 : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(tofrom) capture(ByRef) bounds(%10) -> !llvm.ptr {name = ""} - omp.target map_entries(%5, %11 : !llvm.ptr, !llvm.ptr) { + omp.target map_entries(%5 -> %arg3, %11 -> %arg4: !llvm.ptr, !llvm.ptr) { + ^bb0(%arg3: !llvm.ptr, %arg4: !llvm.ptr): omp.terminator } llvm.return diff --git a/mlir/test/Dialect/OpenMP/canonicalize.mlir b/mlir/test/Dialect/OpenMP/canonicalize.mlir index 8aff8f81188be5..de6c931ecc5fd9 100644 --- a/mlir/test/Dialect/OpenMP/canonicalize.mlir +++ b/mlir/test/Dialect/OpenMP/canonicalize.mlir @@ -131,8 +131,9 @@ func.func private @foo() -> () func.func @constant_hoisting_target(%x : !llvm.ptr) { omp.target { + ^bb0(%arg0: !llvm.ptr): %c1 = arith.constant 10 : i32 - llvm.store %c1, %x : i32, !llvm.ptr + llvm.store %c1, %arg0 : i32, !llvm.ptr omp.terminator } return @@ -141,4 +142,4 @@ func.func @constant_hoisting_target(%x : !llvm.ptr) { // CHECK-LABEL: func.func @constant_hoisting_target // CHECK-NOT: arith.constant // CHECK: omp.target -// CHECK-NEXT: arith.constant +// CHECK: arith.constant diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 6f75f2a62e6413..42e9fb1c64baec 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -1617,7 +1617,9 @@ func.func @omp_threadprivate() { func.func @omp_target(%map1: memref) { %mapv = omp.map_info var_ptr(%map1 : memref, tensor) map_clauses(delete) capture(ByRef) -> memref {name = ""} // expected-error @below {{to, from, tofrom and alloc map types are permitted}} - omp.target map_entries(%mapv : memref){} + omp.target map_entries(%mapv -> %arg0: memref) { + ^bb0(%arg0: memref): + } return } @@ -1656,4 +1658,40 @@ func.func @omp_target_exit_data(%map1: memref) { return } +// ----- + +func.func @omp_map1(%map1: memref, %map2: i32) { + %mapv = omp.map_info var_ptr(%map1 : memref, tensor) val(%map2 : i32) map_clauses(tofrom) capture(ByRef) -> memref {name = ""} + // expected-error @below {{only one of val or var_ptr must be used}} + omp.target map_entries(%mapv -> %arg0: memref) { + ^bb0(%arg0: memref): + omp.terminator + } + return +} + +// ----- + +func.func @omp_map2(%map1: memref, %map2: i32) { + %mapv = omp.map_info var_ptr( : , tensor) val(%map2 : i32) map_clauses(tofrom) capture(ByRef) -> memref {name = ""} + // expected-error @below {{var_type supplied without var_ptr}} + omp.target map_entries(%mapv -> %arg0: memref) { + ^bb0(%arg0: memref): + omp.terminator + } + return +} + +// ----- + +func.func @omp_map3(%map1: memref, %map2: i32) { + %mapv = omp.map_info map_clauses(tofrom) capture(ByRef) -> memref {name = ""} + // expected-error @below {{missing val or var_ptr}} + omp.target map_entries(%mapv -> %arg0: memref) { + ^bb0(%arg0: memref): + omp.terminator + } + return +} + llvm.mlir.global internal @_QFsubEx() : i32 diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 
d59a4f428118bf..4d88d9ac86fe16 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -492,16 +492,22 @@ func.func @omp_target(%if_cond : i1, %device : si32, %num_threads : i32, %map1: // Test with optional map clause. // CHECK: %[[MAP_A:.*]] = omp.map_info var_ptr(%[[VAL_1:.*]] : memref, tensor) map_clauses(tofrom) capture(ByRef) -> memref {name = ""} // CHECK: %[[MAP_B:.*]] = omp.map_info var_ptr(%[[VAL_2:.*]] : memref, tensor) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""} - // CHECK: omp.target map_entries(%[[MAP_A]], %[[MAP_B]] : memref, memref) { + // CHECK: omp.target map_entries(%[[MAP_A]] -> {{.*}}, %[[MAP_B]] -> {{.*}} : memref, memref) { %mapv1 = omp.map_info var_ptr(%map1 : memref, tensor) map_clauses(tofrom) capture(ByRef) -> memref {name = ""} %mapv2 = omp.map_info var_ptr(%map2 : memref, tensor) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""} - omp.target map_entries(%mapv1, %mapv2 : memref, memref){} + omp.target map_entries(%mapv1 -> %arg0, %mapv2 -> %arg1 : memref, memref) { + ^bb0(%arg0: memref, %arg1: memref): + omp.terminator + } // CHECK: %[[MAP_C:.*]] = omp.map_info var_ptr(%[[VAL_1:.*]] : memref, tensor) map_clauses(to) capture(ByRef) -> memref {name = ""} // CHECK: %[[MAP_D:.*]] = omp.map_info var_ptr(%[[VAL_2:.*]] : memref, tensor) map_clauses(always, from) capture(ByRef) -> memref {name = ""} - // CHECK: omp.target map_entries(%[[MAP_C]], %[[MAP_D]] : memref, memref) { + // CHECK: omp.target map_entries(%[[MAP_C]] -> {{.*}}, %[[MAP_D]] -> {{.*}} : memref, memref) { %mapv3 = omp.map_info var_ptr(%map1 : memref, tensor) map_clauses(to) capture(ByRef) -> memref {name = ""} %mapv4 = omp.map_info var_ptr(%map2 : memref, tensor) map_clauses(always, from) capture(ByRef) -> memref {name = ""} - omp.target map_entries(%mapv3, %mapv4 : memref, memref) {} + omp.target map_entries(%mapv3 -> %arg0, %mapv4 -> %arg1 : memref, memref) { + ^bb0(%arg0: memref, %arg1: memref): + omp.terminator + } // CHECK: omp.barrier omp.barrier @@ -2055,8 +2061,11 @@ func.func @omp_targets_with_map_bounds(%arg0: !llvm.ptr, %arg1: !llvm.ptr) -> () %10 = omp.bounds lower_bound(%7 : i64) upper_bound(%6 : i64) stride(%8 : i64) start_idx(%9 : i64) %mapv2 = omp.map_info var_ptr(%arg1 : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(exit_release_or_enter_alloc) capture(ByCopy) bounds(%10) -> !llvm.ptr {name = ""} - // CHECK: omp.target map_entries(%[[MAP0]], %[[MAP1]] : !llvm.ptr, !llvm.ptr) - omp.target map_entries(%mapv1, %mapv2 : !llvm.ptr, !llvm.ptr){} + // CHECK: omp.target map_entries(%[[MAP0]] -> {{.*}}, %[[MAP1]] -> {{.*}} : !llvm.ptr, !llvm.ptr) + omp.target map_entries(%mapv1 -> %arg2, %mapv2 -> %arg3 : !llvm.ptr, !llvm.ptr) { + ^bb0(%arg2: !llvm.ptr, %arg3: !llvm.ptr): + omp.terminator + } // CHECK: omp.target_data map_entries(%[[MAP0]], %[[MAP1]] : !llvm.ptr, !llvm.ptr) omp.target_data map_entries(%mapv1, %mapv2 : !llvm.ptr, !llvm.ptr){} diff --git a/mlir/test/Target/LLVMIR/omptarget-array-sectioning-host.mlir b/mlir/test/Target/LLVMIR/omptarget-array-sectioning-host.mlir index 056085123480ba..307d8a02ce61da 100644 --- a/mlir/test/Target/LLVMIR/omptarget-array-sectioning-host.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-array-sectioning-host.mlir @@ -1,10 +1,10 @@ // RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s // This test checks the offload sizes provided to the OpenMP kernel argument -// structure are correct when lowering to LLVM-IR from MLIR with 3-D bounds -// provided for a 3-D 
array. One with full default size, and the other with -// a user specified OpenMP array sectioning. We expect the default sized -// array bounds to lower to the full size of the array and the sectioned +// structure are correct when lowering to LLVM-IR from MLIR with 3-D bounds +// provided for a 3-D array. One with full default size, and the other with +// a user specified OpenMP array sectioning. We expect the default sized +// array bounds to lower to the full size of the array and the sectioned // array to be the size of 3*3*1*element-byte-size (36 bytes in this case). module attributes {omp.is_target_device = false} { @@ -18,12 +18,13 @@ module attributes {omp.is_target_device = false} { %6 = omp.bounds lower_bound(%2 : i64) upper_bound(%2 : i64) stride(%2 : i64) start_idx(%2 : i64) %7 = omp.map_info var_ptr(%0 : !llvm.ptr, !llvm.array<3 x array<3 x array<3 x i32>>>) map_clauses(tofrom) capture(ByRef) bounds(%5, %5, %6) -> !llvm.ptr {name = "inarray(1:3,1:3,2:2)"} %8 = omp.map_info var_ptr(%1 : !llvm.ptr, !llvm.array<3 x array<3 x array<3 x i32>>>) map_clauses(tofrom) capture(ByRef) bounds(%5, %5, %5) -> !llvm.ptr {name = "outarray(1:3,1:3,1:3)"} - omp.target map_entries(%7, %8 : !llvm.ptr, !llvm.ptr) { + omp.target map_entries(%7 -> %arg0, %8 -> %arg1 : !llvm.ptr, !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): %9 = llvm.mlir.constant(0 : i64) : i64 %10 = llvm.mlir.constant(1 : i64) : i64 - %11 = llvm.getelementptr %0[0, %10, %9, %9] : (!llvm.ptr, i64, i64, i64) -> !llvm.ptr, !llvm.array<3 x array<3 x array<3 x i32>>> + %11 = llvm.getelementptr %arg0[0, %10, %9, %9] : (!llvm.ptr, i64, i64, i64) -> !llvm.ptr, !llvm.array<3 x array<3 x array<3 x i32>>> %12 = llvm.load %11 : !llvm.ptr -> i32 - %13 = llvm.getelementptr %1[0, %10, %9, %9] : (!llvm.ptr, i64, i64, i64) -> !llvm.ptr, !llvm.array<3 x array<3 x array<3 x i32>>> + %13 = llvm.getelementptr %arg1[0, %10, %9, %9] : (!llvm.ptr, i64, i64, i64) -> !llvm.ptr, !llvm.array<3 x array<3 x array<3 x i32>>> llvm.store %12, %13 : i32, !llvm.ptr omp.terminator } diff --git a/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-device.mlir b/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-device.mlir index c0c8640bb30bda..875d04f584ca96 100644 --- a/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-device.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-device.mlir @@ -6,9 +6,10 @@ module attributes {omp.is_target_device = true} { %1 = llvm.mlir.addressof @_QFEsp : !llvm.ptr %2 = omp.map_info var_ptr(%1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "sp"} %3 = omp.map_info var_ptr(%0 : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr {name = "i"} - omp.target map_entries(%2, %3 : !llvm.ptr, !llvm.ptr) { - %4 = llvm.load %0 : !llvm.ptr -> i32 - llvm.store %4, %1 : i32, !llvm.ptr + omp.target map_entries(%2 -> %arg0, %3 -> %arg1 : !llvm.ptr, !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %4 = llvm.load %arg1 : !llvm.ptr -> i32 + llvm.store %4, %arg0 : i32, !llvm.ptr omp.terminator } llvm.return @@ -32,7 +33,7 @@ module attributes {omp.is_target_device = true} { // CHECK: store ptr %[[ARG_BYCOPY]], ptr %[[ALLOCA_BYCOPY]], align 8 // CHECK: user_code.entry: ; preds = %entry -// CHECK: %[[LOAD_BYREF:.*]] = load ptr, ptr %[[ALLOCA_BYREF]], align 8 +// CHECK: %[[LOAD_BYREF:.*]] = load ptr, ptr %[[ALLOCA_BYREF]], align 8 // CHECK: br label %omp.target // CHECK: omp.target: ; preds = %user_code.entry diff --git 
a/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-host.mlir b/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-host.mlir index ca5dad8b4fc9a8..c8fb4e232f06f5 100644 --- a/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-host.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-host.mlir @@ -6,9 +6,10 @@ module attributes {omp.is_target_device = false} { %1 = llvm.mlir.addressof @_QFEsp : !llvm.ptr %2 = omp.map_info var_ptr(%1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "sp"} %3 = omp.map_info var_ptr(%0 : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr {name = "i"} - omp.target map_entries(%2, %3 : !llvm.ptr, !llvm.ptr) { - %4 = llvm.load %0 : !llvm.ptr -> i32 - llvm.store %4, %1 : i32, !llvm.ptr + omp.target map_entries(%2 -> %arg0, %3 -> %arg1 : !llvm.ptr, !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %4 = llvm.load %arg1 : !llvm.ptr -> i32 + llvm.store %4, %arg0 : i32, !llvm.ptr omp.terminator } llvm.return diff --git a/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir b/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir index 24795cf70c009e..cf08761981fb3a 100644 --- a/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir @@ -1,10 +1,10 @@ // RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s // This test the generation of additional load operations for declare target link variables -// inside of target op regions when lowering to IR for device. Unfortunately as the host file is not +// inside of target op regions when lowering to IR for device. Unfortunately as the host file is not // passed as a module attribute, we miss out on the metadata and entryinfo. // -// Unfortunately, only so much can be tested as the device side is dependent on a *.bc +// Unfortunately, only so much can be tested as the device side is dependent on a *.bc // file created by the host and appended as an attribute to the module. 
module attributes {omp.is_target_device = true} { @@ -13,18 +13,19 @@ module attributes {omp.is_target_device = true} { %0 = llvm.mlir.constant(0 : i32) : i32 llvm.return %0 : i32 } - + llvm.func @_QQmain() attributes {} { %0 = llvm.mlir.addressof @_QMtest_0Esp : !llvm.ptr - + // CHECK-DAG: omp.target: ; preds = %user_code.entry // CHECK-DAG: %[[V:.*]] = load ptr, ptr @_QMtest_0Esp_decl_tgt_ref_ptr, align 8 // CHECK-DAG: store i32 1, ptr %[[V]], align 4 // CHECK-DAG: br label %omp.region.cont %map = omp.map_info var_ptr(%0 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target map_entries(%map : !llvm.ptr) { + omp.target map_entries(%map -> %arg0 : !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr): %1 = llvm.mlir.constant(1 : i32) : i32 - llvm.store %1, %0 : i32, !llvm.ptr + llvm.store %1, %arg0 : i32, !llvm.ptr omp.terminator } diff --git a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir index bd399ad935259c..78bab6ece73e6b 100644 --- a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir @@ -15,11 +15,12 @@ module attributes {omp.is_target_device = true} { %map1 = omp.map_info var_ptr(%3 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} %map2 = omp.map_info var_ptr(%5 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} %map3 = omp.map_info var_ptr(%7 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target map_entries(%map1, %map2, %map3 : !llvm.ptr, !llvm.ptr, !llvm.ptr) { - %8 = llvm.load %3 : !llvm.ptr -> i32 - %9 = llvm.load %5 : !llvm.ptr -> i32 + omp.target map_entries(%map1 -> %arg0, %map2 -> %arg1, %map3 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr): + %8 = llvm.load %arg0 : !llvm.ptr -> i32 + %9 = llvm.load %arg1 : !llvm.ptr -> i32 %10 = llvm.add %8, %9 : i32 - llvm.store %10, %7 : i32, !llvm.ptr + llvm.store %10, %arg2 : i32, !llvm.ptr omp.terminator } llvm.return diff --git a/mlir/test/Target/LLVMIR/omptarget-region-llvm-target-device.mlir b/mlir/test/Target/LLVMIR/omptarget-region-llvm-target-device.mlir index 2cd0331087ec01..6fa039f522e206 100644 --- a/mlir/test/Target/LLVMIR/omptarget-region-llvm-target-device.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-region-llvm-target-device.mlir @@ -3,10 +3,11 @@ // RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s module attributes {omp.is_target_device = true} { - llvm.func @writeindex_omp_outline_0_(%arg0: !llvm.ptr, %arg1: !llvm.ptr) attributes {omp.outline_parent_name = "writeindex_"} { - %0 = omp.map_info var_ptr(%arg0 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - %1 = omp.map_info var_ptr(%arg1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - omp.target map_entries(%0, %1 : !llvm.ptr, !llvm.ptr) { + llvm.func @writeindex_omp_outline_0_(%val0: !llvm.ptr, %val1: !llvm.ptr) attributes {omp.outline_parent_name = "writeindex_"} { + %0 = omp.map_info var_ptr(%val0 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + %1 = omp.map_info var_ptr(%val1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target map_entries(%0 -> %arg0, %1 -> %arg1 : !llvm.ptr, !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): %2 = llvm.mlir.constant(20 : i32) : i32 %3 = llvm.mlir.constant(10 : i32) : i32 llvm.store 
%3, %arg0 : i32, !llvm.ptr
diff --git a/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir
index 4e89b8585c7525..b861dd7a7d315f 100644
--- a/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir
@@ -15,11 +15,12 @@ module attributes {omp.is_target_device = false} {
 %map1 = omp.map_info var_ptr(%3 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
 %map2 = omp.map_info var_ptr(%5 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
 %map3 = omp.map_info var_ptr(%7 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
- omp.target map_entries(%map1, %map2, %map3 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
- %8 = llvm.load %3 : !llvm.ptr -> i32
- %9 = llvm.load %5 : !llvm.ptr -> i32
+ omp.target map_entries(%map1 -> %arg0, %map2 -> %arg1, %map3 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+ ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr):
+ %8 = llvm.load %arg0 : !llvm.ptr -> i32
+ %9 = llvm.load %arg1 : !llvm.ptr -> i32
 %10 = llvm.add %8, %9 : i32
- llvm.store %10, %7 : i32, !llvm.ptr
+ llvm.store %10, %arg2 : i32, !llvm.ptr
 omp.terminator
 }
 llvm.return
diff --git a/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir
index 1d8799ecd446f0..c80ea1f0a47be7 100644
--- a/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir
@@ -15,12 +15,13 @@ module attributes {omp.is_target_device = false} {
 %map1 = omp.map_info var_ptr(%3 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
 %map2 = omp.map_info var_ptr(%5 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
 %map3 = omp.map_info var_ptr(%7 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
- omp.target map_entries( %map1, %map2, %map3 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+ omp.target map_entries( %map1 -> %arg0, %map2 -> %arg1, %map3 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+ ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr):
 omp.parallel {
- %8 = llvm.load %3 : !llvm.ptr -> i32
- %9 = llvm.load %5 : !llvm.ptr -> i32
+ %8 = llvm.load %arg0 : !llvm.ptr -> i32
+ %9 = llvm.load %arg1 : !llvm.ptr -> i32
 %10 = llvm.add %8, %9 : i32
- llvm.store %10, %7 : i32, !llvm.ptr
+ llvm.store %10, %arg2 : i32, !llvm.ptr
 omp.terminator
 }
 omp.terminator

From 63752399f8323460f685c8431aae7269ea3f27a2 Mon Sep 17 00:00:00 2001
From: Akash Banerjee
Date: Mon, 25 Sep 2023 12:31:30 +0100
Subject: [PATCH 13/40] [OpenMP][MLIR] OMPEarlyOutliningPass removal

This patch removes the OMPEarlyOutliningPass as it is no longer required.
The implicit map operand capture has now been moved to the PFT lowering
stage.

Depends on #67318.
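Illustrative example (hand-written, not part of the diff below; names are made
up): because implicit captures are now materialized as `implicit` map entries
during PFT lowering, the target region is already self-contained and no
outlined `*_omp_outline_N` host function is needed. A sketch of the expected
lowering for a scalar used inside `!$omp target`:

```mlir
// %n is assumed to be an !llvm.ptr to the implicitly captured scalar.
%m = omp.map_info var_ptr(%n : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "n"}
omp.target map_entries(%m -> %arg0 : !llvm.ptr) {
^bb0(%arg0: !llvm.ptr):
  %c = llvm.mlir.constant(1 : i32) : i32
  llvm.store %c, %arg0 : i32, !llvm.ptr
  omp.terminator
}
```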
--- .../flang/Optimizer/Transforms/Passes.h | 2 - .../flang/Optimizer/Transforms/Passes.td | 12 - flang/include/flang/Tools/CLOptions.inc | 4 +- flang/lib/Optimizer/Transforms/CMakeLists.txt | 1 - .../Transforms/OMPEarlyOutlining.cpp | 303 ------------------ .../OpenMP/FIR/omp-target-early-outlining.f90 | 89 ----- .../Lower/OpenMP/function-filtering-2.f90 | 2 - .../test/Lower/OpenMP/function-filtering.f90 | 5 - .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 2 + 9 files changed, 3 insertions(+), 417 deletions(-) delete mode 100644 flang/lib/Optimizer/Transforms/OMPEarlyOutlining.cpp delete mode 100644 flang/test/Lower/OpenMP/FIR/omp-target-early-outlining.f90 diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h index 30d97be3800c19..92bc7246eca700 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.h +++ b/flang/include/flang/Optimizer/Transforms/Passes.h @@ -75,8 +75,6 @@ std::unique_ptr createAlgebraicSimplificationPass(const mlir::GreedyRewriteConfig &config); std::unique_ptr createPolymorphicOpConversionPass(); -std::unique_ptr> -createOMPEarlyOutliningPass(); std::unique_ptr createOMPFunctionFilteringPass(); std::unique_ptr> createOMPMarkDeclareTargetPass(); diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index 6d211a535b53f7..6e23b87b7e276e 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -318,18 +318,6 @@ def LoopVersioning : Pass<"loop-versioning", "mlir::func::FuncOp"> { let dependentDialects = [ "fir::FIROpsDialect" ]; } -def OMPEarlyOutliningPass - : Pass<"omp-early-target-outlining", "mlir::ModuleOp"> { - let summary = "Outlines all target ops into separate functions"; - let description = [{ - This pass outlines all omp.target operations into individual functions. - It is invoked in the front end after the initial FIR has been constructed. - This pass is only needed when compiling for the target device to prevent - the optimizer to perform transforms across target region boundaries. 
- }]; - let constructor = "::fir::createOMPEarlyOutliningPass()"; -} - def OMPMarkDeclareTargetPass : Pass<"omp-mark-declare-target", "mlir::ModuleOp"> { let summary = "Marks all functions called by an OpenMP declare target function as declare target"; diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index 2ed716382feb43..c452c023b4a80c 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -274,10 +274,8 @@ inline void createHLFIRToFIRPassPipeline( inline void createOpenMPFIRPassPipeline( mlir::PassManager &pm, bool isTargetDevice) { pm.addPass(fir::createOMPMarkDeclareTargetPass()); - if (isTargetDevice) { - pm.addPass(fir::createOMPEarlyOutliningPass()); + if (isTargetDevice) pm.addPass(fir::createOMPFunctionFilteringPass()); - } } #if !defined(FLANG_EXCLUDE_CODEGEN) diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 98314fa7a2087f..03b67104a93b57 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -17,7 +17,6 @@ add_flang_library(FIRTransforms AddDebugFoundation.cpp PolymorphicOpConversion.cpp LoopVersioning.cpp - OMPEarlyOutlining.cpp OMPFunctionFiltering.cpp OMPMarkDeclareTarget.cpp VScaleAttr.cpp diff --git a/flang/lib/Optimizer/Transforms/OMPEarlyOutlining.cpp b/flang/lib/Optimizer/Transforms/OMPEarlyOutlining.cpp deleted file mode 100644 index 92fbdd0bbf5d4a..00000000000000 --- a/flang/lib/Optimizer/Transforms/OMPEarlyOutlining.cpp +++ /dev/null @@ -1,303 +0,0 @@ -#include "flang/Optimizer/Dialect/FIRDialect.h" -#include "flang/Optimizer/Dialect/FIROps.h" -#include "flang/Optimizer/Dialect/FIRType.h" -#include "flang/Optimizer/HLFIR/HLFIROps.h" -#include "flang/Optimizer/Support/InternalNames.h" -#include "flang/Optimizer/Transforms/Passes.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/OpenMP/OpenMPDialect.h" -#include "mlir/IR/BuiltinDialect.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/IRMapping.h" -#include "mlir/IR/Operation.h" -#include "mlir/IR/SymbolTable.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/LLVM.h" -#include "mlir/Transforms/RegionUtils.h" -#include "llvm/Frontend/OpenMP/OMPIRBuilder.h" - -namespace fir { -#define GEN_PASS_DEF_OMPEARLYOUTLININGPASS -#include "flang/Optimizer/Transforms/Passes.h.inc" -} // namespace fir - -namespace { -class OMPEarlyOutliningPass - : public fir::impl::OMPEarlyOutliningPassBase { - - std::string getOutlinedFnName(llvm::StringRef parentName, unsigned count) { - return std::string(parentName) + "_omp_outline_" + std::to_string(count); - } - - // Given a value this function will iterate over an operators results - // and return the relevant index for the result the value corresponds to. - // There may be a simpler way to do this however. 
- static unsigned getResultIndex(mlir::Value value, mlir::Operation *op) { - for (unsigned i = 0; i < op->getNumResults(); ++i) { - if (op->getResult(i) == value) - return i; - } - return 0; - } - - static bool isAddressOfGlobalDeclareTarget(mlir::Value value) { - if (fir::AddrOfOp addressOfOp = - mlir::dyn_cast_if_present(value.getDefiningOp())) - if (fir::GlobalOp gOp = mlir::dyn_cast_if_present( - addressOfOp->getParentOfType().lookupSymbol( - addressOfOp.getSymbol()))) - if (auto declareTargetGlobal = - llvm::dyn_cast( - gOp.getOperation())) - if (declareTargetGlobal.isDeclareTarget()) - return true; - return false; - } - - // Currently used for cloning arguments that are nested. Should be - // extendable where required, perhaps via operation - // specialisation/overloading, if something needs specialised handling. - // NOTE: Results in duplication of some values that would otherwise be - // a single SSA value shared between operations, this is tidied up on - // lowering to some extent. - static mlir::Operation * - cloneArgAndChildren(mlir::OpBuilder &builder, mlir::Operation *op, - llvm::SetVector &inputs, - mlir::Block::BlockArgListType &newInputs) { - mlir::IRMapping valueMap; - for (mlir::Value opValue : op->getOperands()) { - if (opValue.getDefiningOp()) { - unsigned resIdx = getResultIndex(opValue, opValue.getDefiningOp()); - valueMap.map(opValue, - cloneArgAndChildren(builder, opValue.getDefiningOp(), - inputs, newInputs) - ->getResult(resIdx)); - } else { - for (auto inArg : llvm::zip(inputs, newInputs)) { - if (opValue == std::get<0>(inArg)) - valueMap.map(opValue, std::get<1>(inArg)); - } - } - } - - return builder.clone(*op, valueMap); - } - - static void cloneMapOpVariables(mlir::OpBuilder &builder, - mlir::IRMapping &valueMap, - mlir::IRMapping &mapInfoMap, - llvm::SetVector &inputs, - mlir::Block::BlockArgListType &newInputs, - mlir::Value varPtr) { - if (fir::BoxAddrOp boxAddrOp = - mlir::dyn_cast_if_present(varPtr.getDefiningOp())) { - mlir::Value newV = - cloneArgAndChildren(builder, boxAddrOp, inputs, newInputs) - ->getResult(0); - mapInfoMap.map(varPtr, newV); - valueMap.map(boxAddrOp, newV); - return; - } - - // Clone into the outlined function all hlfir.declare ops that define inputs - // to the target region and set up remapping of its inputs and outputs. 
- if (auto declareOp = mlir::dyn_cast_if_present( - varPtr.getDefiningOp())) { - auto clone = llvm::cast( - cloneArgAndChildren(builder, declareOp, inputs, newInputs)); - mlir::Value newBase = clone.getBase(); - mlir::Value newOrigBase = clone.getOriginalBase(); - mapInfoMap.map(varPtr, newOrigBase); - valueMap.map(declareOp.getBase(), newBase); - valueMap.map(declareOp.getOriginalBase(), newOrigBase); - return; - } - - if (isAddressOfGlobalDeclareTarget(varPtr)) { - fir::AddrOfOp addrOp = - mlir::dyn_cast(varPtr.getDefiningOp()); - mlir::Value newV = builder.clone(*addrOp)->getResult(0); - mapInfoMap.map(varPtr, newV); - valueMap.map(addrOp, newV); - return; - } - - for (auto inArg : llvm::zip(inputs, newInputs)) { - if (varPtr == std::get<0>(inArg)) - mapInfoMap.map(varPtr, std::get<1>(inArg)); - } - } - - mlir::func::FuncOp outlineTargetOp(mlir::OpBuilder &builder, - mlir::omp::TargetOp &targetOp, - mlir::func::FuncOp &parentFunc, - unsigned count) { - // NOTE: once implicit captures are handled appropriately in the initial - // PFT lowering if it is possible, we can remove the usage of - // getUsedValuesDefinedAbove and instead just iterate over the target op's - // operands (or just the map arguments) and perhaps refactor this function - // a little. - // Collect inputs - llvm::SetVector inputs; - mlir::Region &targetRegion = targetOp.getRegion(); - mlir::getUsedValuesDefinedAbove(targetRegion, inputs); - - // Collect all map info. Even non-used maps must be collected to avoid ICEs. - for (mlir::Value oper : targetOp->getOperands()) { - if (auto mapEntry = - mlir::dyn_cast(oper.getDefiningOp())) { - if (!inputs.contains(mapEntry.getVarPtr())) - inputs.insert(mapEntry.getVarPtr()); - } - } - - // Filter out declare-target and map entries which are specially handled - // at the moment, so we do not wish these to end up as function arguments - // which would just be more noise in the IR. - llvm::SmallVector blockArgs; - for (llvm::SetVector::iterator iter = inputs.begin(); iter != inputs.end();) { - if (mlir::isa_and_nonnull(iter->getDefiningOp()) || - isAddressOfGlobalDeclareTarget(*iter)) { - iter = inputs.erase(iter); - } else if (auto declareOp = mlir::dyn_cast_if_present( - iter->getDefiningOp())) { - // Gather hlfir.declare arguments to be added later, after the - // hlfir.declare operation itself has been removed as an input. - blockArgs.push_back(declareOp.getMemref()); - if (mlir::Value shape = declareOp.getShape()) - blockArgs.push_back(shape); - for (mlir::Value typeParam : declareOp.getTypeparams()) - blockArgs.push_back(typeParam); - iter = inputs.erase(iter); - } else { - ++iter; - } - } - - // Add function arguments to the list of inputs if they are used by an - // hlfir.declare operation. - for (mlir::Value arg : blockArgs) { - if (!arg.getDefiningOp() && !inputs.contains(arg)) - inputs.insert(arg); - } - - // Create new function and initialize - mlir::FunctionType funcType = builder.getFunctionType( - mlir::TypeRange(inputs.getArrayRef()), mlir::TypeRange()); - std::string parentName(parentFunc.getName()); - std::string funcName = getOutlinedFnName(parentName, count); - mlir::Location loc = targetOp.getLoc(); - mlir::func::FuncOp newFunc = - mlir::func::FuncOp::create(loc, funcName, funcType); - mlir::Block *entryBlock = newFunc.addEntryBlock(); - builder.setInsertionPointToStart(entryBlock); - mlir::Block::BlockArgListType newInputs = entryBlock->getArguments(); - - // Set the declare target information, the outlined function - // is always a host function. 
- if (auto parentDTOp = llvm::dyn_cast( - parentFunc.getOperation())) - if (auto newDTOp = llvm::dyn_cast( - newFunc.getOperation())) - newDTOp.setDeclareTarget(mlir::omp::DeclareTargetDeviceType::host, - parentDTOp.getDeclareTargetCaptureClause()); - - // Set the early outlining interface parent name - if (auto earlyOutlineOp = - llvm::dyn_cast( - newFunc.getOperation())) - earlyOutlineOp.setParentName(parentName); - - // The value map for the newly generated Target Operation, we must - // remap most of the input. - mlir::IRMapping valueMap; - - // Special handling for map, declare target and regular map variables - // are handled slightly differently for the moment, declare target has - // its addressOfOp cloned over, whereas we skip it for the regular map - // variables. We need knowledge of which global is linked to the map - // operation for declare target, whereas we aren't bothered for the - // regular map variables for the moment. We could treat both the same, - // however, cloning across the minimum for the moment to avoid - // optimisations breaking segments of the lowering seems prudent as this - // was the original intent of the pass. - for (mlir::Value oper : targetOp->getOperands()) { - if (auto mapEntry = - mlir::dyn_cast(oper.getDefiningOp())) { - mlir::IRMapping mapInfoMap; - for (mlir::Value bound : mapEntry.getBounds()) { - if (auto mapEntryBound = mlir::dyn_cast( - bound.getDefiningOp())) { - mapInfoMap.map(bound, cloneArgAndChildren(builder, mapEntryBound, - inputs, newInputs) - ->getResult(0)); - } - } - - cloneMapOpVariables(builder, valueMap, mapInfoMap, inputs, newInputs, - mapEntry.getVarPtr()); - - if (mapEntry.getVarPtrPtr()) - cloneMapOpVariables(builder, valueMap, mapInfoMap, inputs, newInputs, - mapEntry.getVarPtrPtr()); - - valueMap.map( - mapEntry, - builder.clone(*mapEntry.getOperation(), mapInfoMap)->getResult(0)); - } - } - - for (auto inArg : llvm::zip(inputs, newInputs)) - valueMap.map(std::get<0>(inArg), std::get<1>(inArg)); - - // Clone the target op into the new function - builder.clone(*(targetOp.getOperation()), valueMap); - - // Create return op - builder.create(loc); - - return newFunc; - } - - // Returns true if a target region was found in the function. 
- bool outlineTargetOps(mlir::OpBuilder &builder, - mlir::func::FuncOp &functionOp, - mlir::ModuleOp &moduleOp, - llvm::SmallVectorImpl &newFuncs) { - unsigned count = 0; - for (auto TargetOp : functionOp.getOps()) { - mlir::func::FuncOp outlinedFunc = - outlineTargetOp(builder, TargetOp, functionOp, count); - newFuncs.push_back(outlinedFunc); - count++; - } - return count > 0; - } - - void runOnOperation() override { - mlir::ModuleOp moduleOp = getOperation(); - mlir::MLIRContext *context = &getContext(); - mlir::OpBuilder builder(context); - llvm::SmallVector newFuncs; - - for (auto functionOp : - llvm::make_early_inc_range(moduleOp.getOps())) { - bool outlined = outlineTargetOps(builder, functionOp, moduleOp, newFuncs); - if (outlined) - functionOp.erase(); - } - - for (auto newFunc : newFuncs) - moduleOp.push_back(newFunc); - } -}; - -} // namespace - -namespace fir { -std::unique_ptr> -createOMPEarlyOutliningPass() { - return std::make_unique(); -} -} // namespace fir diff --git a/flang/test/Lower/OpenMP/FIR/omp-target-early-outlining.f90 b/flang/test/Lower/OpenMP/FIR/omp-target-early-outlining.f90 deleted file mode 100644 index 14bf911b3e5410..00000000000000 --- a/flang/test/Lower/OpenMP/FIR/omp-target-early-outlining.f90 +++ /dev/null @@ -1,89 +0,0 @@ -!REQUIRES: amdgpu-registered-target - -!RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s -!RUN: %flang_fc1 -triple x86_64-unknown-linux-gnu -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s -!RUN: bbc -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s -!RUN: bbc -emit-fir -fopenmp -fopenmp-is-gpu -fopenmp-is-target-device %s -o - | FileCheck %s - -!CHECK: func.func @_QPtarget_function - -!CHECK: func.func @_QPwrite_index_omp_outline_0(%[[ARG0:.*]]: !fir.ref) attributes {omp.declare_target = #omp.declaretarget, omp.outline_parent_name = "_QPwrite_index"} { -!CHECK-NEXT: %[[map_info0:.*]] = omp.map_info var_ptr(%[[ARG0]]{{.*}} -!CHECK-NEXT: omp.target map_entries(%[[map_info0]]{{.*}} { -!CHECK: %[[CONSTANT_VALUE_10:.*]] = arith.constant 10 : i32 -!CHECK: fir.store %[[CONSTANT_VALUE_10]] to %[[ARG0]] : !fir.ref -!CHECK: omp.terminator -!CHECK-NEXT: } -!CHECK-NEXT: return - -!CHECK: func.func @_QPwrite_index_omp_outline_1(%[[ARG1:.*]]: !fir.ref) attributes {omp.declare_target = #omp.declaretarget, omp.outline_parent_name = "_QPwrite_index"} { -!CHECK-NEXT: %[[map_info1:.*]] = omp.map_info var_ptr(%[[ARG1]]{{.*}} -!CHECK-NEXT: omp.target map_entries(%[[map_info1]]{{.*}} { -!CHECK: %[[CONSTANT_VALUE_20:.*]] = arith.constant 20 : i32 -!CHECK: fir.store %[[CONSTANT_VALUE_20]] to %[[ARG1]] : !fir.ref -!CHECK: omp.terminator -!CHECK-NEXT: } -!CHECK-NEXT: return - - -SUBROUTINE WRITE_INDEX(INT_ARRAY) - INTEGER :: INT_ARRAY(*) - INTEGER :: NEW_LEN -!$omp target map(from:new_len) - NEW_LEN = 10 -!$omp end target -!$omp target map(from:new_len) - NEW_LEN = 20 -!$omp end target - do INDEX_ = 1, NEW_LEN - INT_ARRAY(INDEX_) = INDEX_ - end do -end subroutine WRITE_INDEX - -SUBROUTINE TARGET_FUNCTION() -!$omp declare target -END - -!CHECK: func.func @_QParray_bounds_omp_outline_0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref>) attributes {omp.declare_target = #omp.declaretarget, omp.outline_parent_name = "_QParray_bounds"} { -!CHECK: %[[C1:.*]] = arith.constant 1 : index -!CHECK: %[[C4:.*]] = arith.constant 4 : index -!CHECK: %[[C1_0:.*]] = arith.constant 1 : index -!CHECK: %[[C1_1:.*]] = arith.constant 1 : index -!CHECK: %[[BOUNDS:.*]] = omp.bounds 
lower_bound(%[[C1]] : index) upper_bound(%[[C4]] : index) stride(%[[C1_1]] : index) start_idx(%[[C1_1]] : index) -!CHECK: %[[ENTRY:.*]] = omp.map_info var_ptr(%[[ARG1]] : !fir.ref>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "sp_write(2:5)"} -!CHECK: omp.target map_entries(%[[ENTRY]] : !fir.ref>) { -!CHECK: %c2_i32 = arith.constant 2 : i32 -!CHECK: %2 = fir.convert %c2_i32 : (i32) -> index -!CHECK: %c5_i32 = arith.constant 5 : i32 -!CHECK: %3 = fir.convert %c5_i32 : (i32) -> index -!CHECK: %c1_2 = arith.constant 1 : index -!CHECK: %4 = fir.convert %2 : (index) -> i32 -!CHECK: %5:2 = fir.do_loop %arg2 = %2 to %3 step %c1_2 iter_args(%arg3 = %4) -> (index, i32) { -!CHECK: fir.store %arg3 to %[[ARG0]] : !fir.ref -!CHECK: %6 = fir.load %[[ARG0]] : !fir.ref -!CHECK: %7 = fir.load %[[ARG0]] : !fir.ref -!CHECK: %8 = fir.convert %7 : (i32) -> i64 -!CHECK: %c1_i64 = arith.constant 1 : i64 -!CHECK: %9 = arith.subi %8, %c1_i64 : i64 -!CHECK: %10 = fir.coordinate_of %[[ARG1]], %9 : (!fir.ref>, i64) -> !fir.ref -!CHECK: fir.store %6 to %10 : !fir.ref -!CHECK: %11 = arith.addi %arg2, %c1_2 : index -!CHECK: %12 = fir.convert %c1_2 : (index) -> i32 -!CHECK: %13 = fir.load %[[ARG0]] : !fir.ref -!CHECK: %14 = arith.addi %13, %12 : i32 -!CHECK: fir.result %11, %14 : index, i32 -!CHECK: } -!CHECK: fir.store %5#1 to %[[ARG0]] : !fir.ref -!CHECK: omp.terminator -!CHECK: } -!CHECK:return -!CHECK:} - -SUBROUTINE ARRAY_BOUNDS() - INTEGER :: sp_write(10) = (/0,0,0,0,0,0,0,0,0,0/) -!$omp target map(tofrom:sp_write(2:5)) - do i = 2, 5 - sp_write(i) = i - end do -!$omp end target -end subroutine ARRAY_BOUNDS diff --git a/flang/test/Lower/OpenMP/function-filtering-2.f90 b/flang/test/Lower/OpenMP/function-filtering-2.f90 index 8219be5ad1e40c..17cd0d44c01b4b 100644 --- a/flang/test/Lower/OpenMP/function-filtering-2.f90 +++ b/flang/test/Lower/OpenMP/function-filtering-2.f90 @@ -26,9 +26,7 @@ subroutine no_declaretarget() end subroutine no_declaretarget ! MLIR-HOST: func.func @{{.*}}main( -! MLIR-HOST-NOT: func.func @{{.*}}main_omp_outline{{.*}}() ! MLIR-DEVICE-NOT: func.func @{{.*}}main( -! MLIR-DEVICE: func.func @{{.*}}main_omp_outline{{.*}}() attributes {omp.declare_target = #omp.declaretarget, omp.outline_parent_name = "_QQmain"} ! MLIR-ALL: return ! LLVM-HOST: define {{.*}} @{{.*}}main{{.*}}( diff --git a/flang/test/Lower/OpenMP/function-filtering.f90 b/flang/test/Lower/OpenMP/function-filtering.f90 index 3de14aa4709fc4..e550348e50692c 100644 --- a/flang/test/Lower/OpenMP/function-filtering.f90 +++ b/flang/test/Lower/OpenMP/function-filtering.f90 @@ -34,14 +34,9 @@ end function host_fn ! MLIR-HOST: func.func @{{.*}}target_subr( ! MLIR-HOST: return -! MLIR-HOST-NOT: func.func @{{.*}}target_subr_omp_outline_0( -! MLIR-DEVICE-NOT: func.func @{{.*}}target_subr( -! MLIR-DEVICE: func.func @{{.*}}target_subr_omp_outline_0( ! MLIR-DEVICE: return -! LLVM-ALL-NOT: define {{.*}} @{{.*}}target_subr_omp_outline_0{{.*}}( ! LLVM-HOST: define {{.*}} @{{.*}}target_subr{{.*}}( -! LLVM-DEVICE-NOT: {{.*}} @{{.*}}target_subr{{.*}}( ! 
LLVM-ALL: define {{.*}} @__omp_offloading_{{.*}}_{{.*}}_target_subr__{{.*}}( subroutine target_subr(x) integer, intent(out) :: x diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 8e741bf83be32f..e6eacaed152287 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2492,6 +2492,8 @@ convertDeclareTargetAttr(Operation *op, mlir::omp::DeclareTargetAttr attribute, if (declareType == omp::DeclareTargetDeviceType::host) { llvm::Function *llvmFunc = moduleTranslation.lookupFunction(funcOp.getName()); + llvmFunc->replaceAllUsesWith( + llvm::UndefValue::get(llvmFunc->getType())); llvmFunc->dropAllReferences(); llvmFunc->eraseFromParent(); } From be59fe5028b99c36c4bc9ccf6304216e50b337a8 Mon Sep 17 00:00:00 2001 From: Akash Banerjee Date: Thu, 2 Nov 2023 15:45:50 +0000 Subject: [PATCH 14/40] [OpenMP][Flang] Fix some of the Fortran OpenMP Offloading tests target_map_common_block2.f90 - Fix the extra space in the print message. - #67164 fixes this, so move the test out of the failing directory and remove the XFAIL marker. basic-target-region-3D-array.f90 - Correct the check to account for the values now being printed on separate lines. Depends on #67319 --- .../fortran/basic-target-region-3D-array.f90 | 13 ++++++++++--- .../{failing => }/target_map_common_block2.f90 | 5 ++--- 2 files changed, 12 insertions(+), 6 deletions(-) rename openmp/libomptarget/test/offloading/fortran/{failing => }/target_map_common_block2.f90 (85%) diff --git a/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f90 b/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f90 index 44ff394dcda16b..abc2763d4a30cc 100644 --- a/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f90 +++ b/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f90 @@ -23,7 +23,7 @@ program main end do i = 1 -j = 1 +j = 1 k = 1 !$omp target map(tofrom:x, counter) map(to: i, j, k, i2, j2, k2) @@ -50,5 +50,12 @@ program main end do end do end program main - -! CHECK: 1 2 3 4 5 6 7 8 + +! CHECK: 1 +! CHECK: 2 +! CHECK: 3 +! CHECK: 4 +! CHECK: 5 +! CHECK: 6 +! CHECK: 7 +! CHECK: 8 diff --git a/openmp/libomptarget/test/offloading/fortran/failing/target_map_common_block2.f90 b/openmp/libomptarget/test/offloading/fortran/target_map_common_block2.f90 similarity index 85% rename from openmp/libomptarget/test/offloading/fortran/failing/target_map_common_block2.f90 rename to openmp/libomptarget/test/offloading/fortran/target_map_common_block2.f90 index 1f95ef7c460757..8a9c47545fbd78 100644 --- a/openmp/libomptarget/test/offloading/fortran/failing/target_map_common_block2.f90 +++ b/openmp/libomptarget/test/offloading/fortran/target_map_common_block2.f90 @@ -7,7 +7,6 @@ ! UNSUPPORTED: x86_64-pc-linux-gnu-LTO ! RUN: %libomptarget-compile-fortran-run-and-check-generic -! XFAIL: * program main use omp_lib integer :: var4 common var4 var4 = 24 tmp = 12 - print *, "var4 before target = ", var4 + print *, "var4 before target =", var4 !$omp target map(tofrom:var4) var4 = tmp !$omp end target - print *, "var4 after target = ", var4 + print *, "var4 after target =", var4 end program ! 
CHECK: var4 before target = 24 From c39995a116a74ebafc63648e8f047d13012c4f87 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Mon, 6 Nov 2023 13:25:20 +0000 Subject: [PATCH 15/40] [mlir][Bazel] Adjust SPIRVTarget and add missing dependency. --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 41a3ef047834bd..2f92f98dd413be 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -5318,6 +5318,7 @@ cc_library( ":GPUDialect", ":GPUPassIncGen", ":Pass", + ":SPIRVDialect", ":Support", "//llvm:Support", ], @@ -6529,12 +6530,12 @@ cc_library( cc_library( name = "SPIRVTarget", srcs = ["lib/Target/SPIRV/Target.cpp"], - hdrs = glob(["include/mlir/Target/SPIRV/*.h"]), + hdrs = ["include/mlir/Target/SPIRV/Target.h"], includes = ["include"], deps = [ ":GPUDialect", ":SPIRVDialect", - ":Support", + ":SPIRVSerialization", ], ) From 939fd6c37c52a12cb968215cb30d1accd360338c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 6 Nov 2023 13:15:49 +0000 Subject: [PATCH 16/40] [AutoUpgrade] Use StringRef::starts_with/ends_with instead of startswith/endswith. NFC. startswith/endswith wrap starts_with/ends_with and will eventually go away (to more closely match string_view) --- llvm/include/llvm/IR/AutoUpgrade.h | 2 +- llvm/lib/IR/AutoUpgrade.cpp | 1042 ++++++++++++++-------------- 2 files changed, 522 insertions(+), 522 deletions(-) diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h index f9b5d0c843907e..152f781ffa9b30 100644 --- a/llvm/include/llvm/IR/AutoUpgrade.h +++ b/llvm/include/llvm/IR/AutoUpgrade.h @@ -90,7 +90,7 @@ namespace llvm { /// Check whether a string looks like an old loop attachment tag. inline bool mayBeOldLoopAttachmentTag(StringRef Name) { - return Name.startswith("llvm.vectorizer."); + return Name.starts_with("llvm.vectorizer."); } /// Upgrade the loop attachment metadata node. 
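As a reference for the mechanical rename in the AutoUpgrade.cpp hunks below: StringRef::starts_with/ends_with are drop-in replacements for the deprecated startswith/endswith spellings (which merely forward to them), mirroring std::string_view::starts_with/ends_with. A minimal sketch of the call-site pattern, assuming only llvm::StringRef; the helper name and intrinsic strings here are illustrative, not taken from the patch:

#include "llvm/ADT/StringRef.h"
#include <cassert>

// Illustrative helper, not from AutoUpgrade.cpp: shows the renamed
// llvm::StringRef predicates. Behavior is identical to the old spellings,
// which were thin wrappers around starts_with/ends_with.
static bool isUpgradableX86Name(llvm::StringRef Name) {
  if (!Name.starts_with("x86.")) // was: Name.startswith("x86.")
    return false;
  Name = Name.substr(4); // Drop the "x86." prefix.
  return Name.starts_with("avx512.mask.") ||
         Name.ends_with(".128"); // was: Name.endswith(".128")
}

int main() {
  assert(isUpgradableX86Name("x86.avx512.mask.pmov.qd.256"));
  assert(!isUpgradableX86Name("aarch64.neon.rbit"));
  return 0;
}

The rename is purely textual, so each hunk that follows swaps the method name and leaves arguments and control flow untouched.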
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index e102aae52597a8..f401135127c1b2 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -128,60 +128,60 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name == "addcarry.u64" || // Added in 8.0 Name == "subborrow.u32" || // Added in 8.0 Name == "subborrow.u64" || // Added in 8.0 - Name.startswith("sse2.padds.") || // Added in 8.0 - Name.startswith("sse2.psubs.") || // Added in 8.0 - Name.startswith("sse2.paddus.") || // Added in 8.0 - Name.startswith("sse2.psubus.") || // Added in 8.0 - Name.startswith("avx2.padds.") || // Added in 8.0 - Name.startswith("avx2.psubs.") || // Added in 8.0 - Name.startswith("avx2.paddus.") || // Added in 8.0 - Name.startswith("avx2.psubus.") || // Added in 8.0 - Name.startswith("avx512.padds.") || // Added in 8.0 - Name.startswith("avx512.psubs.") || // Added in 8.0 - Name.startswith("avx512.mask.padds.") || // Added in 8.0 - Name.startswith("avx512.mask.psubs.") || // Added in 8.0 - Name.startswith("avx512.mask.paddus.") || // Added in 8.0 - Name.startswith("avx512.mask.psubus.") || // Added in 8.0 + Name.starts_with("sse2.padds.") || // Added in 8.0 + Name.starts_with("sse2.psubs.") || // Added in 8.0 + Name.starts_with("sse2.paddus.") || // Added in 8.0 + Name.starts_with("sse2.psubus.") || // Added in 8.0 + Name.starts_with("avx2.padds.") || // Added in 8.0 + Name.starts_with("avx2.psubs.") || // Added in 8.0 + Name.starts_with("avx2.paddus.") || // Added in 8.0 + Name.starts_with("avx2.psubus.") || // Added in 8.0 + Name.starts_with("avx512.padds.") || // Added in 8.0 + Name.starts_with("avx512.psubs.") || // Added in 8.0 + Name.starts_with("avx512.mask.padds.") || // Added in 8.0 + Name.starts_with("avx512.mask.psubs.") || // Added in 8.0 + Name.starts_with("avx512.mask.paddus.") || // Added in 8.0 + Name.starts_with("avx512.mask.psubus.") || // Added in 8.0 Name=="ssse3.pabs.b.128" || // Added in 6.0 Name=="ssse3.pabs.w.128" || // Added in 6.0 Name=="ssse3.pabs.d.128" || // Added in 6.0 - Name.startswith("fma4.vfmadd.s") || // Added in 7.0 - Name.startswith("fma.vfmadd.") || // Added in 7.0 - Name.startswith("fma.vfmsub.") || // Added in 7.0 - Name.startswith("fma.vfmsubadd.") || // Added in 7.0 - Name.startswith("fma.vfnmadd.") || // Added in 7.0 - Name.startswith("fma.vfnmsub.") || // Added in 7.0 - Name.startswith("avx512.mask.vfmadd.") || // Added in 7.0 - Name.startswith("avx512.mask.vfnmadd.") || // Added in 7.0 - Name.startswith("avx512.mask.vfnmsub.") || // Added in 7.0 - Name.startswith("avx512.mask3.vfmadd.") || // Added in 7.0 - Name.startswith("avx512.maskz.vfmadd.") || // Added in 7.0 - Name.startswith("avx512.mask3.vfmsub.") || // Added in 7.0 - Name.startswith("avx512.mask3.vfnmsub.") || // Added in 7.0 - Name.startswith("avx512.mask.vfmaddsub.") || // Added in 7.0 - Name.startswith("avx512.maskz.vfmaddsub.") || // Added in 7.0 - Name.startswith("avx512.mask3.vfmaddsub.") || // Added in 7.0 - Name.startswith("avx512.mask3.vfmsubadd.") || // Added in 7.0 - Name.startswith("avx512.mask.shuf.i") || // Added in 6.0 - Name.startswith("avx512.mask.shuf.f") || // Added in 6.0 - Name.startswith("avx512.kunpck") || //added in 6.0 - Name.startswith("avx2.pabs.") || // Added in 6.0 - Name.startswith("avx512.mask.pabs.") || // Added in 6.0 - Name.startswith("avx512.broadcastm") || // Added in 6.0 + Name.starts_with("fma4.vfmadd.s") || // Added in 7.0 + Name.starts_with("fma.vfmadd.") || // Added in 7.0 + 
Name.starts_with("fma.vfmsub.") || // Added in 7.0 + Name.starts_with("fma.vfmsubadd.") || // Added in 7.0 + Name.starts_with("fma.vfnmadd.") || // Added in 7.0 + Name.starts_with("fma.vfnmsub.") || // Added in 7.0 + Name.starts_with("avx512.mask.vfmadd.") || // Added in 7.0 + Name.starts_with("avx512.mask.vfnmadd.") || // Added in 7.0 + Name.starts_with("avx512.mask.vfnmsub.") || // Added in 7.0 + Name.starts_with("avx512.mask3.vfmadd.") || // Added in 7.0 + Name.starts_with("avx512.maskz.vfmadd.") || // Added in 7.0 + Name.starts_with("avx512.mask3.vfmsub.") || // Added in 7.0 + Name.starts_with("avx512.mask3.vfnmsub.") || // Added in 7.0 + Name.starts_with("avx512.mask.vfmaddsub.") || // Added in 7.0 + Name.starts_with("avx512.maskz.vfmaddsub.") || // Added in 7.0 + Name.starts_with("avx512.mask3.vfmaddsub.") || // Added in 7.0 + Name.starts_with("avx512.mask3.vfmsubadd.") || // Added in 7.0 + Name.starts_with("avx512.mask.shuf.i") || // Added in 6.0 + Name.starts_with("avx512.mask.shuf.f") || // Added in 6.0 + Name.starts_with("avx512.kunpck") || //added in 6.0 + Name.starts_with("avx2.pabs.") || // Added in 6.0 + Name.starts_with("avx512.mask.pabs.") || // Added in 6.0 + Name.starts_with("avx512.broadcastm") || // Added in 6.0 Name == "sse.sqrt.ss" || // Added in 7.0 Name == "sse2.sqrt.sd" || // Added in 7.0 - Name.startswith("avx512.mask.sqrt.p") || // Added in 7.0 - Name.startswith("avx.sqrt.p") || // Added in 7.0 - Name.startswith("sse2.sqrt.p") || // Added in 7.0 - Name.startswith("sse.sqrt.p") || // Added in 7.0 - Name.startswith("avx512.mask.pbroadcast") || // Added in 6.0 - Name.startswith("sse2.pcmpeq.") || // Added in 3.1 - Name.startswith("sse2.pcmpgt.") || // Added in 3.1 - Name.startswith("avx2.pcmpeq.") || // Added in 3.1 - Name.startswith("avx2.pcmpgt.") || // Added in 3.1 - Name.startswith("avx512.mask.pcmpeq.") || // Added in 3.9 - Name.startswith("avx512.mask.pcmpgt.") || // Added in 3.9 - Name.startswith("avx.vperm2f128.") || // Added in 6.0 + Name.starts_with("avx512.mask.sqrt.p") || // Added in 7.0 + Name.starts_with("avx.sqrt.p") || // Added in 7.0 + Name.starts_with("sse2.sqrt.p") || // Added in 7.0 + Name.starts_with("sse.sqrt.p") || // Added in 7.0 + Name.starts_with("avx512.mask.pbroadcast") || // Added in 6.0 + Name.starts_with("sse2.pcmpeq.") || // Added in 3.1 + Name.starts_with("sse2.pcmpgt.") || // Added in 3.1 + Name.starts_with("avx2.pcmpeq.") || // Added in 3.1 + Name.starts_with("avx2.pcmpgt.") || // Added in 3.1 + Name.starts_with("avx512.mask.pcmpeq.") || // Added in 3.9 + Name.starts_with("avx512.mask.pcmpgt.") || // Added in 3.9 + Name.starts_with("avx.vperm2f128.") || // Added in 6.0 Name == "avx2.vperm2i128" || // Added in 6.0 Name == "sse.add.ss" || // Added in 4.0 Name == "sse2.add.sd" || // Added in 4.0 @@ -211,48 +211,48 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name == "avx512.kxnor.w" || // Added in 7.0 Name == "avx512.kortestc.w" || // Added in 7.0 Name == "avx512.kortestz.w" || // Added in 7.0 - Name.startswith("avx512.mask.pshuf.b.") || // Added in 4.0 - Name.startswith("avx2.pmax") || // Added in 3.9 - Name.startswith("avx2.pmin") || // Added in 3.9 - Name.startswith("avx512.mask.pmax") || // Added in 4.0 - Name.startswith("avx512.mask.pmin") || // Added in 4.0 - Name.startswith("avx2.vbroadcast") || // Added in 3.8 - Name.startswith("avx2.pbroadcast") || // Added in 3.8 - Name.startswith("avx.vpermil.") || // Added in 3.1 - Name.startswith("sse2.pshuf") || // Added in 3.9 - 
Name.startswith("avx512.pbroadcast") || // Added in 3.9 - Name.startswith("avx512.mask.broadcast.s") || // Added in 3.9 - Name.startswith("avx512.mask.movddup") || // Added in 3.9 - Name.startswith("avx512.mask.movshdup") || // Added in 3.9 - Name.startswith("avx512.mask.movsldup") || // Added in 3.9 - Name.startswith("avx512.mask.pshuf.d.") || // Added in 3.9 - Name.startswith("avx512.mask.pshufl.w.") || // Added in 3.9 - Name.startswith("avx512.mask.pshufh.w.") || // Added in 3.9 - Name.startswith("avx512.mask.shuf.p") || // Added in 4.0 - Name.startswith("avx512.mask.vpermil.p") || // Added in 3.9 - Name.startswith("avx512.mask.perm.df.") || // Added in 3.9 - Name.startswith("avx512.mask.perm.di.") || // Added in 3.9 - Name.startswith("avx512.mask.punpckl") || // Added in 3.9 - Name.startswith("avx512.mask.punpckh") || // Added in 3.9 - Name.startswith("avx512.mask.unpckl.") || // Added in 3.9 - Name.startswith("avx512.mask.unpckh.") || // Added in 3.9 - Name.startswith("avx512.mask.pand.") || // Added in 3.9 - Name.startswith("avx512.mask.pandn.") || // Added in 3.9 - Name.startswith("avx512.mask.por.") || // Added in 3.9 - Name.startswith("avx512.mask.pxor.") || // Added in 3.9 - Name.startswith("avx512.mask.and.") || // Added in 3.9 - Name.startswith("avx512.mask.andn.") || // Added in 3.9 - Name.startswith("avx512.mask.or.") || // Added in 3.9 - Name.startswith("avx512.mask.xor.") || // Added in 3.9 - Name.startswith("avx512.mask.padd.") || // Added in 4.0 - Name.startswith("avx512.mask.psub.") || // Added in 4.0 - Name.startswith("avx512.mask.pmull.") || // Added in 4.0 - Name.startswith("avx512.mask.cvtdq2pd.") || // Added in 4.0 - Name.startswith("avx512.mask.cvtudq2pd.") || // Added in 4.0 - Name.startswith("avx512.mask.cvtudq2ps.") || // Added in 7.0 updated 9.0 - Name.startswith("avx512.mask.cvtqq2pd.") || // Added in 7.0 updated 9.0 - Name.startswith("avx512.mask.cvtuqq2pd.") || // Added in 7.0 updated 9.0 - Name.startswith("avx512.mask.cvtdq2ps.") || // Added in 7.0 updated 9.0 + Name.starts_with("avx512.mask.pshuf.b.") || // Added in 4.0 + Name.starts_with("avx2.pmax") || // Added in 3.9 + Name.starts_with("avx2.pmin") || // Added in 3.9 + Name.starts_with("avx512.mask.pmax") || // Added in 4.0 + Name.starts_with("avx512.mask.pmin") || // Added in 4.0 + Name.starts_with("avx2.vbroadcast") || // Added in 3.8 + Name.starts_with("avx2.pbroadcast") || // Added in 3.8 + Name.starts_with("avx.vpermil.") || // Added in 3.1 + Name.starts_with("sse2.pshuf") || // Added in 3.9 + Name.starts_with("avx512.pbroadcast") || // Added in 3.9 + Name.starts_with("avx512.mask.broadcast.s") || // Added in 3.9 + Name.starts_with("avx512.mask.movddup") || // Added in 3.9 + Name.starts_with("avx512.mask.movshdup") || // Added in 3.9 + Name.starts_with("avx512.mask.movsldup") || // Added in 3.9 + Name.starts_with("avx512.mask.pshuf.d.") || // Added in 3.9 + Name.starts_with("avx512.mask.pshufl.w.") || // Added in 3.9 + Name.starts_with("avx512.mask.pshufh.w.") || // Added in 3.9 + Name.starts_with("avx512.mask.shuf.p") || // Added in 4.0 + Name.starts_with("avx512.mask.vpermil.p") || // Added in 3.9 + Name.starts_with("avx512.mask.perm.df.") || // Added in 3.9 + Name.starts_with("avx512.mask.perm.di.") || // Added in 3.9 + Name.starts_with("avx512.mask.punpckl") || // Added in 3.9 + Name.starts_with("avx512.mask.punpckh") || // Added in 3.9 + Name.starts_with("avx512.mask.unpckl.") || // Added in 3.9 + Name.starts_with("avx512.mask.unpckh.") || // Added in 3.9 + 
Name.starts_with("avx512.mask.pand.") || // Added in 3.9 + Name.starts_with("avx512.mask.pandn.") || // Added in 3.9 + Name.starts_with("avx512.mask.por.") || // Added in 3.9 + Name.starts_with("avx512.mask.pxor.") || // Added in 3.9 + Name.starts_with("avx512.mask.and.") || // Added in 3.9 + Name.starts_with("avx512.mask.andn.") || // Added in 3.9 + Name.starts_with("avx512.mask.or.") || // Added in 3.9 + Name.starts_with("avx512.mask.xor.") || // Added in 3.9 + Name.starts_with("avx512.mask.padd.") || // Added in 4.0 + Name.starts_with("avx512.mask.psub.") || // Added in 4.0 + Name.starts_with("avx512.mask.pmull.") || // Added in 4.0 + Name.starts_with("avx512.mask.cvtdq2pd.") || // Added in 4.0 + Name.starts_with("avx512.mask.cvtudq2pd.") || // Added in 4.0 + Name.starts_with("avx512.mask.cvtudq2ps.") || // Added in 7.0 updated 9.0 + Name.starts_with("avx512.mask.cvtqq2pd.") || // Added in 7.0 updated 9.0 + Name.starts_with("avx512.mask.cvtuqq2pd.") || // Added in 7.0 updated 9.0 + Name.starts_with("avx512.mask.cvtdq2ps.") || // Added in 7.0 updated 9.0 Name == "avx512.mask.vcvtph2ps.128" || // Added in 11.0 Name == "avx512.mask.vcvtph2ps.256" || // Added in 11.0 Name == "avx512.mask.cvtqq2ps.256" || // Added in 9.0 @@ -267,91 +267,91 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name == "avx512.mask.cvtps2pd.128" || // Added in 7.0 Name == "avx512.mask.cvtps2pd.256" || // Added in 7.0 Name == "avx512.cvtusi2sd" || // Added in 7.0 - Name.startswith("avx512.mask.permvar.") || // Added in 7.0 + Name.starts_with("avx512.mask.permvar.") || // Added in 7.0 Name == "sse2.pmulu.dq" || // Added in 7.0 Name == "sse41.pmuldq" || // Added in 7.0 Name == "avx2.pmulu.dq" || // Added in 7.0 Name == "avx2.pmul.dq" || // Added in 7.0 Name == "avx512.pmulu.dq.512" || // Added in 7.0 Name == "avx512.pmul.dq.512" || // Added in 7.0 - Name.startswith("avx512.mask.pmul.dq.") || // Added in 4.0 - Name.startswith("avx512.mask.pmulu.dq.") || // Added in 4.0 - Name.startswith("avx512.mask.pmul.hr.sw.") || // Added in 7.0 - Name.startswith("avx512.mask.pmulh.w.") || // Added in 7.0 - Name.startswith("avx512.mask.pmulhu.w.") || // Added in 7.0 - Name.startswith("avx512.mask.pmaddw.d.") || // Added in 7.0 - Name.startswith("avx512.mask.pmaddubs.w.") || // Added in 7.0 - Name.startswith("avx512.mask.packsswb.") || // Added in 5.0 - Name.startswith("avx512.mask.packssdw.") || // Added in 5.0 - Name.startswith("avx512.mask.packuswb.") || // Added in 5.0 - Name.startswith("avx512.mask.packusdw.") || // Added in 5.0 - Name.startswith("avx512.mask.cmp.b") || // Added in 5.0 - Name.startswith("avx512.mask.cmp.d") || // Added in 5.0 - Name.startswith("avx512.mask.cmp.q") || // Added in 5.0 - Name.startswith("avx512.mask.cmp.w") || // Added in 5.0 - Name.startswith("avx512.cmp.p") || // Added in 12.0 - Name.startswith("avx512.mask.ucmp.") || // Added in 5.0 - Name.startswith("avx512.cvtb2mask.") || // Added in 7.0 - Name.startswith("avx512.cvtw2mask.") || // Added in 7.0 - Name.startswith("avx512.cvtd2mask.") || // Added in 7.0 - Name.startswith("avx512.cvtq2mask.") || // Added in 7.0 - Name.startswith("avx512.mask.vpermilvar.") || // Added in 4.0 - Name.startswith("avx512.mask.psll.d") || // Added in 4.0 - Name.startswith("avx512.mask.psll.q") || // Added in 4.0 - Name.startswith("avx512.mask.psll.w") || // Added in 4.0 - Name.startswith("avx512.mask.psra.d") || // Added in 4.0 - Name.startswith("avx512.mask.psra.q") || // Added in 4.0 - Name.startswith("avx512.mask.psra.w") || // Added in 4.0 - 
Name.startswith("avx512.mask.psrl.d") || // Added in 4.0 - Name.startswith("avx512.mask.psrl.q") || // Added in 4.0 - Name.startswith("avx512.mask.psrl.w") || // Added in 4.0 - Name.startswith("avx512.mask.pslli") || // Added in 4.0 - Name.startswith("avx512.mask.psrai") || // Added in 4.0 - Name.startswith("avx512.mask.psrli") || // Added in 4.0 - Name.startswith("avx512.mask.psllv") || // Added in 4.0 - Name.startswith("avx512.mask.psrav") || // Added in 4.0 - Name.startswith("avx512.mask.psrlv") || // Added in 4.0 - Name.startswith("sse41.pmovsx") || // Added in 3.8 - Name.startswith("sse41.pmovzx") || // Added in 3.9 - Name.startswith("avx2.pmovsx") || // Added in 3.9 - Name.startswith("avx2.pmovzx") || // Added in 3.9 - Name.startswith("avx512.mask.pmovsx") || // Added in 4.0 - Name.startswith("avx512.mask.pmovzx") || // Added in 4.0 - Name.startswith("avx512.mask.lzcnt.") || // Added in 5.0 - Name.startswith("avx512.mask.pternlog.") || // Added in 7.0 - Name.startswith("avx512.maskz.pternlog.") || // Added in 7.0 - Name.startswith("avx512.mask.vpmadd52") || // Added in 7.0 - Name.startswith("avx512.maskz.vpmadd52") || // Added in 7.0 - Name.startswith("avx512.mask.vpermi2var.") || // Added in 7.0 - Name.startswith("avx512.mask.vpermt2var.") || // Added in 7.0 - Name.startswith("avx512.maskz.vpermt2var.") || // Added in 7.0 - Name.startswith("avx512.mask.vpdpbusd.") || // Added in 7.0 - Name.startswith("avx512.maskz.vpdpbusd.") || // Added in 7.0 - Name.startswith("avx512.mask.vpdpbusds.") || // Added in 7.0 - Name.startswith("avx512.maskz.vpdpbusds.") || // Added in 7.0 - Name.startswith("avx512.mask.vpdpwssd.") || // Added in 7.0 - Name.startswith("avx512.maskz.vpdpwssd.") || // Added in 7.0 - Name.startswith("avx512.mask.vpdpwssds.") || // Added in 7.0 - Name.startswith("avx512.maskz.vpdpwssds.") || // Added in 7.0 - Name.startswith("avx512.mask.dbpsadbw.") || // Added in 7.0 - Name.startswith("avx512.mask.vpshld.") || // Added in 7.0 - Name.startswith("avx512.mask.vpshrd.") || // Added in 7.0 - Name.startswith("avx512.mask.vpshldv.") || // Added in 8.0 - Name.startswith("avx512.mask.vpshrdv.") || // Added in 8.0 - Name.startswith("avx512.maskz.vpshldv.") || // Added in 8.0 - Name.startswith("avx512.maskz.vpshrdv.") || // Added in 8.0 - Name.startswith("avx512.vpshld.") || // Added in 8.0 - Name.startswith("avx512.vpshrd.") || // Added in 8.0 - Name.startswith("avx512.mask.add.p") || // Added in 7.0. 128/256 in 4.0 - Name.startswith("avx512.mask.sub.p") || // Added in 7.0. 128/256 in 4.0 - Name.startswith("avx512.mask.mul.p") || // Added in 7.0. 128/256 in 4.0 - Name.startswith("avx512.mask.div.p") || // Added in 7.0. 128/256 in 4.0 - Name.startswith("avx512.mask.max.p") || // Added in 7.0. 128/256 in 5.0 - Name.startswith("avx512.mask.min.p") || // Added in 7.0. 
128/256 in 5.0 - Name.startswith("avx512.mask.fpclass.p") || // Added in 7.0 - Name.startswith("avx512.mask.vpshufbitqmb.") || // Added in 8.0 - Name.startswith("avx512.mask.pmultishift.qb.") || // Added in 8.0 - Name.startswith("avx512.mask.conflict.") || // Added in 9.0 + Name.starts_with("avx512.mask.pmul.dq.") || // Added in 4.0 + Name.starts_with("avx512.mask.pmulu.dq.") || // Added in 4.0 + Name.starts_with("avx512.mask.pmul.hr.sw.") || // Added in 7.0 + Name.starts_with("avx512.mask.pmulh.w.") || // Added in 7.0 + Name.starts_with("avx512.mask.pmulhu.w.") || // Added in 7.0 + Name.starts_with("avx512.mask.pmaddw.d.") || // Added in 7.0 + Name.starts_with("avx512.mask.pmaddubs.w.") || // Added in 7.0 + Name.starts_with("avx512.mask.packsswb.") || // Added in 5.0 + Name.starts_with("avx512.mask.packssdw.") || // Added in 5.0 + Name.starts_with("avx512.mask.packuswb.") || // Added in 5.0 + Name.starts_with("avx512.mask.packusdw.") || // Added in 5.0 + Name.starts_with("avx512.mask.cmp.b") || // Added in 5.0 + Name.starts_with("avx512.mask.cmp.d") || // Added in 5.0 + Name.starts_with("avx512.mask.cmp.q") || // Added in 5.0 + Name.starts_with("avx512.mask.cmp.w") || // Added in 5.0 + Name.starts_with("avx512.cmp.p") || // Added in 12.0 + Name.starts_with("avx512.mask.ucmp.") || // Added in 5.0 + Name.starts_with("avx512.cvtb2mask.") || // Added in 7.0 + Name.starts_with("avx512.cvtw2mask.") || // Added in 7.0 + Name.starts_with("avx512.cvtd2mask.") || // Added in 7.0 + Name.starts_with("avx512.cvtq2mask.") || // Added in 7.0 + Name.starts_with("avx512.mask.vpermilvar.") || // Added in 4.0 + Name.starts_with("avx512.mask.psll.d") || // Added in 4.0 + Name.starts_with("avx512.mask.psll.q") || // Added in 4.0 + Name.starts_with("avx512.mask.psll.w") || // Added in 4.0 + Name.starts_with("avx512.mask.psra.d") || // Added in 4.0 + Name.starts_with("avx512.mask.psra.q") || // Added in 4.0 + Name.starts_with("avx512.mask.psra.w") || // Added in 4.0 + Name.starts_with("avx512.mask.psrl.d") || // Added in 4.0 + Name.starts_with("avx512.mask.psrl.q") || // Added in 4.0 + Name.starts_with("avx512.mask.psrl.w") || // Added in 4.0 + Name.starts_with("avx512.mask.pslli") || // Added in 4.0 + Name.starts_with("avx512.mask.psrai") || // Added in 4.0 + Name.starts_with("avx512.mask.psrli") || // Added in 4.0 + Name.starts_with("avx512.mask.psllv") || // Added in 4.0 + Name.starts_with("avx512.mask.psrav") || // Added in 4.0 + Name.starts_with("avx512.mask.psrlv") || // Added in 4.0 + Name.starts_with("sse41.pmovsx") || // Added in 3.8 + Name.starts_with("sse41.pmovzx") || // Added in 3.9 + Name.starts_with("avx2.pmovsx") || // Added in 3.9 + Name.starts_with("avx2.pmovzx") || // Added in 3.9 + Name.starts_with("avx512.mask.pmovsx") || // Added in 4.0 + Name.starts_with("avx512.mask.pmovzx") || // Added in 4.0 + Name.starts_with("avx512.mask.lzcnt.") || // Added in 5.0 + Name.starts_with("avx512.mask.pternlog.") || // Added in 7.0 + Name.starts_with("avx512.maskz.pternlog.") || // Added in 7.0 + Name.starts_with("avx512.mask.vpmadd52") || // Added in 7.0 + Name.starts_with("avx512.maskz.vpmadd52") || // Added in 7.0 + Name.starts_with("avx512.mask.vpermi2var.") || // Added in 7.0 + Name.starts_with("avx512.mask.vpermt2var.") || // Added in 7.0 + Name.starts_with("avx512.maskz.vpermt2var.") || // Added in 7.0 + Name.starts_with("avx512.mask.vpdpbusd.") || // Added in 7.0 + Name.starts_with("avx512.maskz.vpdpbusd.") || // Added in 7.0 + Name.starts_with("avx512.mask.vpdpbusds.") || // Added in 7.0 + 
Name.starts_with("avx512.maskz.vpdpbusds.") || // Added in 7.0 + Name.starts_with("avx512.mask.vpdpwssd.") || // Added in 7.0 + Name.starts_with("avx512.maskz.vpdpwssd.") || // Added in 7.0 + Name.starts_with("avx512.mask.vpdpwssds.") || // Added in 7.0 + Name.starts_with("avx512.maskz.vpdpwssds.") || // Added in 7.0 + Name.starts_with("avx512.mask.dbpsadbw.") || // Added in 7.0 + Name.starts_with("avx512.mask.vpshld.") || // Added in 7.0 + Name.starts_with("avx512.mask.vpshrd.") || // Added in 7.0 + Name.starts_with("avx512.mask.vpshldv.") || // Added in 8.0 + Name.starts_with("avx512.mask.vpshrdv.") || // Added in 8.0 + Name.starts_with("avx512.maskz.vpshldv.") || // Added in 8.0 + Name.starts_with("avx512.maskz.vpshrdv.") || // Added in 8.0 + Name.starts_with("avx512.vpshld.") || // Added in 8.0 + Name.starts_with("avx512.vpshrd.") || // Added in 8.0 + Name.starts_with("avx512.mask.add.p") || // Added in 7.0. 128/256 in 4.0 + Name.starts_with("avx512.mask.sub.p") || // Added in 7.0. 128/256 in 4.0 + Name.starts_with("avx512.mask.mul.p") || // Added in 7.0. 128/256 in 4.0 + Name.starts_with("avx512.mask.div.p") || // Added in 7.0. 128/256 in 4.0 + Name.starts_with("avx512.mask.max.p") || // Added in 7.0. 128/256 in 5.0 + Name.starts_with("avx512.mask.min.p") || // Added in 7.0. 128/256 in 5.0 + Name.starts_with("avx512.mask.fpclass.p") || // Added in 7.0 + Name.starts_with("avx512.mask.vpshufbitqmb.") || // Added in 8.0 + Name.starts_with("avx512.mask.pmultishift.qb.") || // Added in 8.0 + Name.starts_with("avx512.mask.conflict.") || // Added in 9.0 Name == "avx512.mask.pmov.qd.256" || // Added in 9.0 Name == "avx512.mask.pmov.qd.512" || // Added in 9.0 Name == "avx512.mask.pmov.wb.256" || // Added in 9.0 @@ -367,85 +367,85 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name == "avx.cvtdq2.pd.256" || // Added in 3.9 Name == "avx.cvtdq2.ps.256" || // Added in 7.0 Name == "avx.cvt.ps2.pd.256" || // Added in 3.9 - Name.startswith("vcvtph2ps.") || // Added in 11.0 - Name.startswith("avx.vinsertf128.") || // Added in 3.7 + Name.starts_with("vcvtph2ps.") || // Added in 11.0 + Name.starts_with("avx.vinsertf128.") || // Added in 3.7 Name == "avx2.vinserti128" || // Added in 3.7 - Name.startswith("avx512.mask.insert") || // Added in 4.0 - Name.startswith("avx.vextractf128.") || // Added in 3.7 + Name.starts_with("avx512.mask.insert") || // Added in 4.0 + Name.starts_with("avx.vextractf128.") || // Added in 3.7 Name == "avx2.vextracti128" || // Added in 3.7 - Name.startswith("avx512.mask.vextract") || // Added in 4.0 - Name.startswith("sse4a.movnt.") || // Added in 3.9 - Name.startswith("avx.movnt.") || // Added in 3.2 - Name.startswith("avx512.storent.") || // Added in 3.9 + Name.starts_with("avx512.mask.vextract") || // Added in 4.0 + Name.starts_with("sse4a.movnt.") || // Added in 3.9 + Name.starts_with("avx.movnt.") || // Added in 3.2 + Name.starts_with("avx512.storent.") || // Added in 3.9 Name == "sse41.movntdqa" || // Added in 5.0 Name == "avx2.movntdqa" || // Added in 5.0 Name == "avx512.movntdqa" || // Added in 5.0 Name == "sse2.storel.dq" || // Added in 3.9 - Name.startswith("sse.storeu.") || // Added in 3.9 - Name.startswith("sse2.storeu.") || // Added in 3.9 - Name.startswith("avx.storeu.") || // Added in 3.9 - Name.startswith("avx512.mask.storeu.") || // Added in 3.9 - Name.startswith("avx512.mask.store.p") || // Added in 3.9 - Name.startswith("avx512.mask.store.b.") || // Added in 3.9 - Name.startswith("avx512.mask.store.w.") || // Added in 3.9 - 
Name.startswith("avx512.mask.store.d.") || // Added in 3.9 - Name.startswith("avx512.mask.store.q.") || // Added in 3.9 + Name.starts_with("sse.storeu.") || // Added in 3.9 + Name.starts_with("sse2.storeu.") || // Added in 3.9 + Name.starts_with("avx.storeu.") || // Added in 3.9 + Name.starts_with("avx512.mask.storeu.") || // Added in 3.9 + Name.starts_with("avx512.mask.store.p") || // Added in 3.9 + Name.starts_with("avx512.mask.store.b.") || // Added in 3.9 + Name.starts_with("avx512.mask.store.w.") || // Added in 3.9 + Name.starts_with("avx512.mask.store.d.") || // Added in 3.9 + Name.starts_with("avx512.mask.store.q.") || // Added in 3.9 Name == "avx512.mask.store.ss" || // Added in 7.0 - Name.startswith("avx512.mask.loadu.") || // Added in 3.9 - Name.startswith("avx512.mask.load.") || // Added in 3.9 - Name.startswith("avx512.mask.expand.load.") || // Added in 7.0 - Name.startswith("avx512.mask.compress.store.") || // Added in 7.0 - Name.startswith("avx512.mask.expand.b") || // Added in 9.0 - Name.startswith("avx512.mask.expand.w") || // Added in 9.0 - Name.startswith("avx512.mask.expand.d") || // Added in 9.0 - Name.startswith("avx512.mask.expand.q") || // Added in 9.0 - Name.startswith("avx512.mask.expand.p") || // Added in 9.0 - Name.startswith("avx512.mask.compress.b") || // Added in 9.0 - Name.startswith("avx512.mask.compress.w") || // Added in 9.0 - Name.startswith("avx512.mask.compress.d") || // Added in 9.0 - Name.startswith("avx512.mask.compress.q") || // Added in 9.0 - Name.startswith("avx512.mask.compress.p") || // Added in 9.0 + Name.starts_with("avx512.mask.loadu.") || // Added in 3.9 + Name.starts_with("avx512.mask.load.") || // Added in 3.9 + Name.starts_with("avx512.mask.expand.load.") || // Added in 7.0 + Name.starts_with("avx512.mask.compress.store.") || // Added in 7.0 + Name.starts_with("avx512.mask.expand.b") || // Added in 9.0 + Name.starts_with("avx512.mask.expand.w") || // Added in 9.0 + Name.starts_with("avx512.mask.expand.d") || // Added in 9.0 + Name.starts_with("avx512.mask.expand.q") || // Added in 9.0 + Name.starts_with("avx512.mask.expand.p") || // Added in 9.0 + Name.starts_with("avx512.mask.compress.b") || // Added in 9.0 + Name.starts_with("avx512.mask.compress.w") || // Added in 9.0 + Name.starts_with("avx512.mask.compress.d") || // Added in 9.0 + Name.starts_with("avx512.mask.compress.q") || // Added in 9.0 + Name.starts_with("avx512.mask.compress.p") || // Added in 9.0 Name == "sse42.crc32.64.8" || // Added in 3.4 - Name.startswith("avx.vbroadcast.s") || // Added in 3.5 - Name.startswith("avx512.vbroadcast.s") || // Added in 7.0 - Name.startswith("avx512.mask.palignr.") || // Added in 3.9 - Name.startswith("avx512.mask.valign.") || // Added in 4.0 - Name.startswith("sse2.psll.dq") || // Added in 3.7 - Name.startswith("sse2.psrl.dq") || // Added in 3.7 - Name.startswith("avx2.psll.dq") || // Added in 3.7 - Name.startswith("avx2.psrl.dq") || // Added in 3.7 - Name.startswith("avx512.psll.dq") || // Added in 3.9 - Name.startswith("avx512.psrl.dq") || // Added in 3.9 + Name.starts_with("avx.vbroadcast.s") || // Added in 3.5 + Name.starts_with("avx512.vbroadcast.s") || // Added in 7.0 + Name.starts_with("avx512.mask.palignr.") || // Added in 3.9 + Name.starts_with("avx512.mask.valign.") || // Added in 4.0 + Name.starts_with("sse2.psll.dq") || // Added in 3.7 + Name.starts_with("sse2.psrl.dq") || // Added in 3.7 + Name.starts_with("avx2.psll.dq") || // Added in 3.7 + Name.starts_with("avx2.psrl.dq") || // Added in 3.7 + Name.starts_with("avx512.psll.dq") 
|| // Added in 3.9 + Name.starts_with("avx512.psrl.dq") || // Added in 3.9 Name == "sse41.pblendw" || // Added in 3.7 - Name.startswith("sse41.blendp") || // Added in 3.7 - Name.startswith("avx.blend.p") || // Added in 3.7 + Name.starts_with("sse41.blendp") || // Added in 3.7 + Name.starts_with("avx.blend.p") || // Added in 3.7 Name == "avx2.pblendw" || // Added in 3.7 - Name.startswith("avx2.pblendd.") || // Added in 3.7 - Name.startswith("avx.vbroadcastf128") || // Added in 4.0 + Name.starts_with("avx2.pblendd.") || // Added in 3.7 + Name.starts_with("avx.vbroadcastf128") || // Added in 4.0 Name == "avx2.vbroadcasti128" || // Added in 3.7 - Name.startswith("avx512.mask.broadcastf32x4.") || // Added in 6.0 - Name.startswith("avx512.mask.broadcastf64x2.") || // Added in 6.0 - Name.startswith("avx512.mask.broadcastf32x8.") || // Added in 6.0 - Name.startswith("avx512.mask.broadcastf64x4.") || // Added in 6.0 - Name.startswith("avx512.mask.broadcasti32x4.") || // Added in 6.0 - Name.startswith("avx512.mask.broadcasti64x2.") || // Added in 6.0 - Name.startswith("avx512.mask.broadcasti32x8.") || // Added in 6.0 - Name.startswith("avx512.mask.broadcasti64x4.") || // Added in 6.0 + Name.starts_with("avx512.mask.broadcastf32x4.") || // Added in 6.0 + Name.starts_with("avx512.mask.broadcastf64x2.") || // Added in 6.0 + Name.starts_with("avx512.mask.broadcastf32x8.") || // Added in 6.0 + Name.starts_with("avx512.mask.broadcastf64x4.") || // Added in 6.0 + Name.starts_with("avx512.mask.broadcasti32x4.") || // Added in 6.0 + Name.starts_with("avx512.mask.broadcasti64x2.") || // Added in 6.0 + Name.starts_with("avx512.mask.broadcasti32x8.") || // Added in 6.0 + Name.starts_with("avx512.mask.broadcasti64x4.") || // Added in 6.0 Name == "xop.vpcmov" || // Added in 3.8 Name == "xop.vpcmov.256" || // Added in 5.0 - Name.startswith("avx512.mask.move.s") || // Added in 4.0 - Name.startswith("avx512.cvtmask2") || // Added in 5.0 - Name.startswith("xop.vpcom") || // Added in 3.2, Updated in 9.0 - Name.startswith("xop.vprot") || // Added in 8.0 - Name.startswith("avx512.prol") || // Added in 8.0 - Name.startswith("avx512.pror") || // Added in 8.0 - Name.startswith("avx512.mask.prorv.") || // Added in 8.0 - Name.startswith("avx512.mask.pror.") || // Added in 8.0 - Name.startswith("avx512.mask.prolv.") || // Added in 8.0 - Name.startswith("avx512.mask.prol.") || // Added in 8.0 - Name.startswith("avx512.ptestm") || //Added in 6.0 - Name.startswith("avx512.ptestnm") || //Added in 6.0 - Name.startswith("avx512.mask.pavg")) // Added in 6.0 + Name.starts_with("avx512.mask.move.s") || // Added in 4.0 + Name.starts_with("avx512.cvtmask2") || // Added in 5.0 + Name.starts_with("xop.vpcom") || // Added in 3.2, Updated in 9.0 + Name.starts_with("xop.vprot") || // Added in 8.0 + Name.starts_with("avx512.prol") || // Added in 8.0 + Name.starts_with("avx512.pror") || // Added in 8.0 + Name.starts_with("avx512.mask.prorv.") || // Added in 8.0 + Name.starts_with("avx512.mask.pror.") || // Added in 8.0 + Name.starts_with("avx512.mask.prolv.") || // Added in 8.0 + Name.starts_with("avx512.mask.prol.") || // Added in 8.0 + Name.starts_with("avx512.ptestm") || //Added in 6.0 + Name.starts_with("avx512.ptestnm") || //Added in 6.0 + Name.starts_with("avx512.mask.pavg")) // Added in 6.0 return true; return false; @@ -454,7 +454,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name, Function *&NewFn) { // Only handle intrinsics that start with 
"x86.". - if (!Name.startswith("x86.")) + if (!Name.starts_with("x86.")) return false; // Remove "x86." prefix. Name = Name.substr(4); @@ -476,7 +476,7 @@ static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name, } // SSE4.1 ptest functions may have an old signature. - if (Name.startswith("sse41.ptest")) { // Added in 3.2 + if (Name.starts_with("sse41.ptest")) { // Added in 3.2 if (Name.substr(11) == "c") return UpgradePTESTIntrinsic(F, Intrinsic::x86_sse41_ptestc, NewFn); if (Name.substr(11) == "z") @@ -551,20 +551,20 @@ static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name, F, Intrinsic::x86_avx512bf16_dpbf16ps_512, NewFn); // frcz.ss/sd may need to have an argument dropped. Added in 3.2 - if (Name.startswith("xop.vfrcz.ss") && F->arg_size() == 2) { + if (Name.starts_with("xop.vfrcz.ss") && F->arg_size() == 2) { rename(F); NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::x86_xop_vfrcz_ss); return true; } - if (Name.startswith("xop.vfrcz.sd") && F->arg_size() == 2) { + if (Name.starts_with("xop.vfrcz.sd") && F->arg_size() == 2) { rename(F); NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::x86_xop_vfrcz_sd); return true; } // Upgrade any XOP PERMIL2 index operand still using a float/double vector. - if (Name.startswith("xop.vpermil2")) { // Added in 3.9 + if (Name.starts_with("xop.vpermil2")) { // Added in 3.9 auto Idx = F->getFunctionType()->getParamType(2); if (Idx->isFPOrFPVectorTy()) { rename(F); @@ -677,24 +677,24 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { // Quickly eliminate it, if it's not a candidate. StringRef Name = F->getName(); - if (Name.size() <= 7 || !Name.startswith("llvm.")) + if (Name.size() <= 7 || !Name.starts_with("llvm.")) return false; Name = Name.substr(5); // Strip off "llvm." 
switch (Name[0]) { default: break; case 'a': { - if (Name.startswith("arm.rbit") || Name.startswith("aarch64.rbit")) { + if (Name.starts_with("arm.rbit") || Name.starts_with("aarch64.rbit")) { NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::bitreverse, F->arg_begin()->getType()); return true; } - if (Name.startswith("aarch64.neon.frintn")) { + if (Name.starts_with("aarch64.neon.frintn")) { NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::roundeven, F->arg_begin()->getType()); return true; } - if (Name.startswith("aarch64.neon.rbit")) { + if (Name.starts_with("aarch64.neon.rbit")) { NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::bitreverse, F->arg_begin()->getType()); return true; @@ -730,13 +730,13 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Ty); return true; } - if (Name.startswith("aarch64.sve.tuple.get")) { + if (Name.starts_with("aarch64.sve.tuple.get")) { Type *Tys[] = {F->getReturnType(), F->arg_begin()->getType()}; NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::vector_extract, Tys); return true; } - if (Name.startswith("aarch64.sve.tuple.set")) { + if (Name.starts_with("aarch64.sve.tuple.set")) { auto Args = F->getFunctionType()->params(); Type *Tys[] = {Args[0], Args[2], Args[1]}; NewFn = Intrinsic::getDeclaration(F->getParent(), @@ -752,7 +752,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Intrinsic::vector_insert, Tys); return true; } - if (Name.startswith("arm.neon.vclz")) { + if (Name.starts_with("arm.neon.vclz")) { Type* args[2] = { F->arg_begin()->getType(), Type::getInt1Ty(F->getContext()) @@ -765,7 +765,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { "llvm.ctlz." + Name.substr(14), F->getParent()); return true; } - if (Name.startswith("arm.neon.vcnt")) { + if (Name.starts_with("arm.neon.vcnt")) { NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop, F->arg_begin()->getType()); return true; @@ -796,27 +796,27 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::thread_pointer); return true; } - if (Name.startswith("arm.neon.vqadds.")) { + if (Name.starts_with("arm.neon.vqadds.")) { NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::sadd_sat, F->arg_begin()->getType()); return true; } - if (Name.startswith("arm.neon.vqaddu.")) { + if (Name.starts_with("arm.neon.vqaddu.")) { NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::uadd_sat, F->arg_begin()->getType()); return true; } - if (Name.startswith("arm.neon.vqsubs.")) { + if (Name.starts_with("arm.neon.vqsubs.")) { NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ssub_sat, F->arg_begin()->getType()); return true; } - if (Name.startswith("arm.neon.vqsubu.")) { + if (Name.starts_with("arm.neon.vqsubu.")) { NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::usub_sat, F->arg_begin()->getType()); return true; } - if (Name.startswith("aarch64.neon.addp")) { + if (Name.starts_with("aarch64.neon.addp")) { if (F->arg_size() != 2) break; // Invalid IR. 
VectorType *Ty = dyn_cast(F->getReturnType()); @@ -829,9 +829,9 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { // Changed in 12.0: bfdot accept v4bf16 and v8bf16 instead of v8i8 and v16i8 // respectively - if ((Name.startswith("arm.neon.bfdot.") || - Name.startswith("aarch64.neon.bfdot.")) && - Name.endswith("i8")) { + if ((Name.starts_with("arm.neon.bfdot.") || + Name.starts_with("aarch64.neon.bfdot.")) && + Name.ends_with("i8")) { Intrinsic::ID IID = StringSwitch(Name) .Cases("arm.neon.bfdot.v2f32.v8i8", @@ -858,9 +858,9 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { // Changed in 12.0: bfmmla, bfmlalb and bfmlalt are not polymorphic anymore // and accept v8bf16 instead of v16i8 - if ((Name.startswith("arm.neon.bfm") || - Name.startswith("aarch64.neon.bfm")) && - Name.endswith(".v4f32.v16i8")) { + if ((Name.starts_with("arm.neon.bfm") || + Name.starts_with("aarch64.neon.bfm")) && + Name.ends_with(".v4f32.v16i8")) { Intrinsic::ID IID = StringSwitch(Name) .Case("arm.neon.bfmmla.v4f32.v16i8", @@ -920,14 +920,14 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { return true; } - if (Name.startswith("atomic.inc") || Name.startswith("atomic.dec")) { + if (Name.starts_with("atomic.inc") || Name.starts_with("atomic.dec")) { // This was replaced with atomicrmw uinc_wrap and udec_wrap, so there's no // new declaration. NewFn = nullptr; return true; } - if (Name.startswith("ldexp.")) { + if (Name.starts_with("ldexp.")) { // Target specific intrinsic became redundant NewFn = Intrinsic::getDeclaration( F->getParent(), Intrinsic::ldexp, @@ -939,13 +939,13 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { break; } case 'c': { - if (Name.startswith("ctlz.") && F->arg_size() == 1) { + if (Name.starts_with("ctlz.") && F->arg_size() == 1) { rename(F); NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, F->arg_begin()->getType()); return true; } - if (Name.startswith("cttz.") && F->arg_size() == 1) { + if (Name.starts_with("cttz.") && F->arg_size() == 1) { rename(F); NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::cttz, F->arg_begin()->getType()); @@ -1032,14 +1032,14 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { } break; // No other 'e*'. 
case 'f': - if (Name.startswith("flt.rounds")) { + if (Name.starts_with("flt.rounds")) { rename(F); NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::get_rounding); return true; } break; case 'i': - if (Name.startswith("invariant.group.barrier")) { + if (Name.starts_with("invariant.group.barrier")) { // Rename invariant.group.barrier to launder.invariant.group auto Args = F->getFunctionType()->params(); Type* ObjectPtr[1] = {Args[0]}; @@ -1066,7 +1066,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { return true; } } - if (Name.startswith("memset.") && F->arg_size() == 5) { + if (Name.starts_with("memset.") && F->arg_size() == 5) { rename(F); // Get the types of dest, and len const auto *FT = F->getFunctionType(); @@ -1121,7 +1121,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Expand = Name == "i" || Name == "ll" || Name == "ui" || Name == "ull"; else if (Name.consume_front("atomic.load.add.")) // nvvm.atomic.load.add.{f32.p,f64.p} - Expand = Name.startswith("f32.p") || Name.startswith("f64.p"); + Expand = Name.starts_with("f32.p") || Name.starts_with("f64.p"); else Expand = false; @@ -1136,7 +1136,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { case 'o': // We only need to change the name to match the mangling including the // address space. - if (Name.startswith("objectsize.")) { + if (Name.starts_with("objectsize.")) { Type *Tys[2] = { F->getReturnType(), F->arg_begin()->getType() }; if (F->arg_size() == 2 || F->arg_size() == 3 || F->getName() != @@ -1150,7 +1150,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { break; case 'p': - if (Name.startswith("ptr.annotation.") && F->arg_size() == 4) { + if (Name.starts_with("ptr.annotation.") && F->arg_size() == 4) { rename(F); NewFn = Intrinsic::getDeclaration( F->getParent(), Intrinsic::ptr_annotation, @@ -1866,7 +1866,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, unsigned VecWidth = CI.getType()->getPrimitiveSizeInBits(); unsigned EltWidth = CI.getType()->getScalarSizeInBits(); Intrinsic::ID IID; - if (Name.startswith("max.p")) { + if (Name.starts_with("max.p")) { if (VecWidth == 128 && EltWidth == 32) IID = Intrinsic::x86_sse_max_ps; else if (VecWidth == 128 && EltWidth == 64) @@ -1877,7 +1877,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx_max_pd_256; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("min.p")) { + } else if (Name.starts_with("min.p")) { if (VecWidth == 128 && EltWidth == 32) IID = Intrinsic::x86_sse_min_ps; else if (VecWidth == 128 && EltWidth == 64) @@ -1888,7 +1888,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx_min_pd_256; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("pshuf.b.")) { + } else if (Name.starts_with("pshuf.b.")) { if (VecWidth == 128) IID = Intrinsic::x86_ssse3_pshuf_b_128; else if (VecWidth == 256) @@ -1897,7 +1897,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx512_pshuf_b_512; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("pmul.hr.sw.")) { + } else if (Name.starts_with("pmul.hr.sw.")) { if (VecWidth == 128) IID = Intrinsic::x86_ssse3_pmul_hr_sw_128; else if (VecWidth == 256) @@ -1906,7 +1906,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = 
Intrinsic::x86_avx512_pmul_hr_sw_512; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("pmulh.w.")) { + } else if (Name.starts_with("pmulh.w.")) { if (VecWidth == 128) IID = Intrinsic::x86_sse2_pmulh_w; else if (VecWidth == 256) @@ -1915,7 +1915,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx512_pmulh_w_512; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("pmulhu.w.")) { + } else if (Name.starts_with("pmulhu.w.")) { if (VecWidth == 128) IID = Intrinsic::x86_sse2_pmulhu_w; else if (VecWidth == 256) @@ -1924,7 +1924,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx512_pmulhu_w_512; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("pmaddw.d.")) { + } else if (Name.starts_with("pmaddw.d.")) { if (VecWidth == 128) IID = Intrinsic::x86_sse2_pmadd_wd; else if (VecWidth == 256) @@ -1933,7 +1933,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx512_pmaddw_d_512; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("pmaddubs.w.")) { + } else if (Name.starts_with("pmaddubs.w.")) { if (VecWidth == 128) IID = Intrinsic::x86_ssse3_pmadd_ub_sw_128; else if (VecWidth == 256) @@ -1942,7 +1942,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx512_pmaddubs_w_512; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("packsswb.")) { + } else if (Name.starts_with("packsswb.")) { if (VecWidth == 128) IID = Intrinsic::x86_sse2_packsswb_128; else if (VecWidth == 256) @@ -1951,7 +1951,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx512_packsswb_512; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("packssdw.")) { + } else if (Name.starts_with("packssdw.")) { if (VecWidth == 128) IID = Intrinsic::x86_sse2_packssdw_128; else if (VecWidth == 256) @@ -1960,7 +1960,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx512_packssdw_512; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("packuswb.")) { + } else if (Name.starts_with("packuswb.")) { if (VecWidth == 128) IID = Intrinsic::x86_sse2_packuswb_128; else if (VecWidth == 256) @@ -1969,7 +1969,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx512_packuswb_512; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("packusdw.")) { + } else if (Name.starts_with("packusdw.")) { if (VecWidth == 128) IID = Intrinsic::x86_sse41_packusdw; else if (VecWidth == 256) @@ -1978,7 +1978,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx512_packusdw_512; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("vpermilvar.")) { + } else if (Name.starts_with("vpermilvar.")) { if (VecWidth == 128 && EltWidth == 32) IID = Intrinsic::x86_avx_vpermilvar_ps; else if (VecWidth == 128 && EltWidth == 64) @@ -2003,7 +2003,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_sse2_cvttps2dq; } else if (Name == "cvttps2dq.256") { IID = Intrinsic::x86_avx_cvtt_ps2dq_256; - } else if (Name.startswith("permvar.")) { + } else if (Name.starts_with("permvar.")) { bool IsFloat = 
CI.getType()->isFPOrFPVectorTy(); if (VecWidth == 256 && EltWidth == 32 && IsFloat) IID = Intrinsic::x86_avx2_permps; @@ -2035,7 +2035,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx512_permvar_qi_512; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("dbpsadbw.")) { + } else if (Name.starts_with("dbpsadbw.")) { if (VecWidth == 128) IID = Intrinsic::x86_avx512_dbpsadbw_128; else if (VecWidth == 256) @@ -2044,7 +2044,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx512_dbpsadbw_512; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("pmultishift.qb.")) { + } else if (Name.starts_with("pmultishift.qb.")) { if (VecWidth == 128) IID = Intrinsic::x86_avx512_pmultishift_qb_128; else if (VecWidth == 256) @@ -2053,7 +2053,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx512_pmultishift_qb_512; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("conflict.")) { + } else if (Name.starts_with("conflict.")) { if (Name[9] == 'd' && VecWidth == 128) IID = Intrinsic::x86_avx512_conflict_d_128; else if (Name[9] == 'd' && VecWidth == 256) @@ -2068,7 +2068,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx512_conflict_q_512; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("pavg.")) { + } else if (Name.starts_with("pavg.")) { if (Name[5] == 'b' && VecWidth == 128) IID = Intrinsic::x86_sse2_pavg_b; else if (Name[5] == 'b' && VecWidth == 256) @@ -2204,8 +2204,8 @@ static Value *UpgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, static Value *UpgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI, Function *F, IRBuilder<> &Builder) { - const bool IsInc = Name.startswith("atomic.inc."); - if (IsInc || Name.startswith("atomic.dec.")) { + const bool IsInc = Name.starts_with("atomic.inc."); + if (IsInc || Name.starts_with("atomic.dec.")) { if (CI->getNumOperands() != 6) // Malformed bitcode. return nullptr; @@ -2256,23 +2256,23 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { // Get the Function's name. 
StringRef Name = F->getName(); - assert(Name.startswith("llvm.") && "Intrinsic doesn't start with 'llvm.'"); + assert(Name.starts_with("llvm.") && "Intrinsic doesn't start with 'llvm.'"); Name = Name.substr(5); - bool IsX86 = Name.startswith("x86."); + bool IsX86 = Name.starts_with("x86."); if (IsX86) Name = Name.substr(4); - bool IsNVVM = Name.startswith("nvvm."); + bool IsNVVM = Name.starts_with("nvvm."); if (IsNVVM) Name = Name.substr(5); - bool IsARM = Name.startswith("arm."); + bool IsARM = Name.starts_with("arm."); if (IsARM) Name = Name.substr(4); - bool IsAMDGCN = Name.startswith("amdgcn."); + bool IsAMDGCN = Name.starts_with("amdgcn."); if (IsAMDGCN) Name = Name.substr(7); - if (IsX86 && Name.startswith("sse4a.movnt.")) { + if (IsX86 && Name.starts_with("sse4a.movnt.")) { SmallVector Elts; Elts.push_back( ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1))); @@ -2297,8 +2297,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { return; } - if (IsX86 && (Name.startswith("avx.movnt.") || - Name.startswith("avx512.storent."))) { + if (IsX86 && (Name.starts_with("avx.movnt.") || + Name.starts_with("avx512.storent."))) { SmallVector Elts; Elts.push_back( ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1))); @@ -2338,9 +2338,9 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { return; } - if (IsX86 && (Name.startswith("sse.storeu.") || - Name.startswith("sse2.storeu.") || - Name.startswith("avx.storeu."))) { + if (IsX86 && (Name.starts_with("sse.storeu.") || + Name.starts_with("sse2.storeu.") || + Name.starts_with("avx.storeu."))) { Value *Arg0 = CI->getArgOperand(0); Value *Arg1 = CI->getArgOperand(1); @@ -2364,7 +2364,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { return; } - if (IsX86 && (Name.startswith("avx512.mask.store"))) { + if (IsX86 && (Name.starts_with("avx512.mask.store"))) { // "avx512.mask.storeu." or "avx512.mask.store." bool Aligned = Name[17] != 'u'; // "avx512.mask.storeu". UpgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1), @@ -2377,14 +2377,14 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Value *Rep; // Upgrade packed integer vector compare intrinsics to compare instructions. - if (IsX86 && (Name.startswith("sse2.pcmp") || - Name.startswith("avx2.pcmp"))) { + if (IsX86 && (Name.starts_with("sse2.pcmp") || + Name.starts_with("avx2.pcmp"))) { // "sse2.pcpmpeq." "sse2.pcmpgt." "avx2.pcmpeq." or "avx2.pcmpgt." bool CmpEq = Name[9] == 'e'; Rep = Builder.CreateICmp(CmpEq ? 
ICmpInst::ICMP_EQ : ICmpInst::ICMP_SGT, CI->getArgOperand(0), CI->getArgOperand(1)); Rep = Builder.CreateSExt(Rep, CI->getType(), ""); - } else if (IsX86 && (Name.startswith("avx512.broadcastm"))) { + } else if (IsX86 && (Name.starts_with("avx512.broadcastm"))) { Type *ExtTy = Type::getInt32Ty(C); if (CI->getOperand(0)->getType()->isIntegerTy(8)) ExtTy = Type::getInt64Ty(C); @@ -2400,14 +2400,14 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Intrinsic::sqrt, Elt0->getType()); Elt0 = Builder.CreateCall(Intr, Elt0); Rep = Builder.CreateInsertElement(Vec, Elt0, (uint64_t)0); - } else if (IsX86 && (Name.startswith("avx.sqrt.p") || - Name.startswith("sse2.sqrt.p") || - Name.startswith("sse.sqrt.p"))) { + } else if (IsX86 && (Name.starts_with("avx.sqrt.p") || + Name.starts_with("sse2.sqrt.p") || + Name.starts_with("sse.sqrt.p"))) { Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), Intrinsic::sqrt, CI->getType()), {CI->getArgOperand(0)}); - } else if (IsX86 && (Name.startswith("avx512.mask.sqrt.p"))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.sqrt.p"))) { if (CI->arg_size() == 4 && (!isa(CI->getArgOperand(3)) || cast(CI->getArgOperand(3))->getZExtValue() != 4)) { @@ -2425,8 +2425,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); - } else if (IsX86 && (Name.startswith("avx512.ptestm") || - Name.startswith("avx512.ptestnm"))) { + } else if (IsX86 && (Name.starts_with("avx512.ptestm") || + Name.starts_with("avx512.ptestnm"))) { Value *Op0 = CI->getArgOperand(0); Value *Op1 = CI->getArgOperand(1); Value *Mask = CI->getArgOperand(2); @@ -2434,16 +2434,16 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { llvm::Type *Ty = Op0->getType(); Value *Zero = llvm::Constant::getNullValue(Ty); ICmpInst::Predicate Pred = - Name.startswith("avx512.ptestm") ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; + Name.starts_with("avx512.ptestm") ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; Rep = Builder.CreateICmp(Pred, Rep, Zero); Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, Mask); - } else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast"))){ + } else if (IsX86 && (Name.starts_with("avx512.mask.pbroadcast"))){ unsigned NumElts = cast(CI->getArgOperand(1)->getType()) ->getNumElements(); Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0)); Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); - } else if (IsX86 && (Name.startswith("avx512.kunpck"))) { + } else if (IsX86 && (Name.starts_with("avx512.kunpck"))) { unsigned NumElts = CI->getType()->getScalarSizeInBits(); Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), NumElts); Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), NumElts); @@ -2525,11 +2525,11 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { EltOp = Builder.CreateFDiv(Elt0, Elt1); Rep = Builder.CreateInsertElement(CI->getArgOperand(0), EltOp, ConstantInt::get(I32Ty, 0)); - } else if (IsX86 && Name.startswith("avx512.mask.pcmp")) { + } else if (IsX86 && Name.starts_with("avx512.mask.pcmp")) { // "avx512.mask.pcmpeq." or "avx512.mask.pcmpgt." bool CmpEq = Name[16] == 'e'; Rep = upgradeMaskedCompare(Builder, *CI, CmpEq ? 
0 : 6, true); - } else if (IsX86 && Name.startswith("avx512.mask.vpshufbitqmb.")) { + } else if (IsX86 && Name.starts_with("avx512.mask.vpshufbitqmb.")) { Type *OpTy = CI->getArgOperand(0)->getType(); unsigned VecWidth = OpTy->getPrimitiveSizeInBits(); Intrinsic::ID IID; @@ -2543,7 +2543,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), { CI->getOperand(0), CI->getArgOperand(1) }); Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); - } else if (IsX86 && Name.startswith("avx512.mask.fpclass.p")) { + } else if (IsX86 && Name.starts_with("avx512.mask.fpclass.p")) { Type *OpTy = CI->getArgOperand(0)->getType(); unsigned VecWidth = OpTy->getPrimitiveSizeInBits(); unsigned EltWidth = OpTy->getScalarSizeInBits(); @@ -2566,7 +2566,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), { CI->getOperand(0), CI->getArgOperand(1) }); Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); - } else if (IsX86 && Name.startswith("avx512.cmp.p")) { + } else if (IsX86 && Name.starts_with("avx512.cmp.p")) { SmallVector Args(CI->args()); Type *OpTy = Args[0]->getType(); unsigned VecWidth = OpTy->getPrimitiveSizeInBits(); @@ -2594,17 +2594,17 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), Args); - } else if (IsX86 && Name.startswith("avx512.mask.cmp.")) { + } else if (IsX86 && Name.starts_with("avx512.mask.cmp.")) { // Integer compare intrinsics. unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); Rep = upgradeMaskedCompare(Builder, *CI, Imm, true); - } else if (IsX86 && Name.startswith("avx512.mask.ucmp.")) { + } else if (IsX86 && Name.starts_with("avx512.mask.ucmp.")) { unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); Rep = upgradeMaskedCompare(Builder, *CI, Imm, false); - } else if (IsX86 && (Name.startswith("avx512.cvtb2mask.") || - Name.startswith("avx512.cvtw2mask.") || - Name.startswith("avx512.cvtd2mask.") || - Name.startswith("avx512.cvtq2mask."))) { + } else if (IsX86 && (Name.starts_with("avx512.cvtb2mask.") || + Name.starts_with("avx512.cvtw2mask.") || + Name.starts_with("avx512.cvtd2mask.") || + Name.starts_with("avx512.cvtq2mask."))) { Value *Op = CI->getArgOperand(0); Value *Zero = llvm::Constant::getNullValue(Op->getType()); Rep = Builder.CreateICmp(ICmpInst::ICMP_SLT, Op, Zero); @@ -2612,42 +2612,42 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } else if(IsX86 && (Name == "ssse3.pabs.b.128" || Name == "ssse3.pabs.w.128" || Name == "ssse3.pabs.d.128" || - Name.startswith("avx2.pabs") || - Name.startswith("avx512.mask.pabs"))) { + Name.starts_with("avx2.pabs") || + Name.starts_with("avx512.mask.pabs"))) { Rep = upgradeAbs(Builder, *CI); } else if (IsX86 && (Name == "sse41.pmaxsb" || Name == "sse2.pmaxs.w" || Name == "sse41.pmaxsd" || - Name.startswith("avx2.pmaxs") || - Name.startswith("avx512.mask.pmaxs"))) { + Name.starts_with("avx2.pmaxs") || + Name.starts_with("avx512.mask.pmaxs"))) { Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smax); } else if (IsX86 && (Name == "sse2.pmaxu.b" || Name == "sse41.pmaxuw" || Name == "sse41.pmaxud" || - Name.startswith("avx2.pmaxu") || - Name.startswith("avx512.mask.pmaxu"))) { + Name.starts_with("avx2.pmaxu") || + Name.starts_with("avx512.mask.pmaxu"))) { Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, 
Intrinsic::umax); } else if (IsX86 && (Name == "sse41.pminsb" || Name == "sse2.pmins.w" || Name == "sse41.pminsd" || - Name.startswith("avx2.pmins") || - Name.startswith("avx512.mask.pmins"))) { + Name.starts_with("avx2.pmins") || + Name.starts_with("avx512.mask.pmins"))) { Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smin); } else if (IsX86 && (Name == "sse2.pminu.b" || Name == "sse41.pminuw" || Name == "sse41.pminud" || - Name.startswith("avx2.pminu") || - Name.startswith("avx512.mask.pminu"))) { + Name.starts_with("avx2.pminu") || + Name.starts_with("avx512.mask.pminu"))) { Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::umin); } else if (IsX86 && (Name == "sse2.pmulu.dq" || Name == "avx2.pmulu.dq" || Name == "avx512.pmulu.dq.512" || - Name.startswith("avx512.mask.pmulu.dq."))) { + Name.starts_with("avx512.mask.pmulu.dq."))) { Rep = upgradePMULDQ(Builder, *CI, /*Signed*/false); } else if (IsX86 && (Name == "sse41.pmuldq" || Name == "avx2.pmul.dq" || Name == "avx512.pmul.dq.512" || - Name.startswith("avx512.mask.pmul.dq."))) { + Name.starts_with("avx512.mask.pmul.dq."))) { Rep = upgradePMULDQ(Builder, *CI, /*Signed*/true); } else if (IsX86 && (Name == "sse.cvtsi2ss" || Name == "sse2.cvtsi2sd" || @@ -2671,12 +2671,12 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Name == "sse2.cvtdq2ps" || Name == "avx.cvtdq2.pd.256" || Name == "avx.cvtdq2.ps.256" || - Name.startswith("avx512.mask.cvtdq2pd.") || - Name.startswith("avx512.mask.cvtudq2pd.") || - Name.startswith("avx512.mask.cvtdq2ps.") || - Name.startswith("avx512.mask.cvtudq2ps.") || - Name.startswith("avx512.mask.cvtqq2pd.") || - Name.startswith("avx512.mask.cvtuqq2pd.") || + Name.starts_with("avx512.mask.cvtdq2pd.") || + Name.starts_with("avx512.mask.cvtudq2pd.") || + Name.starts_with("avx512.mask.cvtdq2ps.") || + Name.starts_with("avx512.mask.cvtudq2ps.") || + Name.starts_with("avx512.mask.cvtqq2pd.") || + Name.starts_with("avx512.mask.cvtuqq2pd.") || Name == "avx512.mask.cvtqq2ps.256" || Name == "avx512.mask.cvtqq2ps.512" || Name == "avx512.mask.cvtuqq2ps.256" || @@ -2715,8 +2715,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { if (CI->arg_size() >= 3) Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); - } else if (IsX86 && (Name.startswith("avx512.mask.vcvtph2ps.") || - Name.startswith("vcvtph2ps."))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.vcvtph2ps.") || + Name.starts_with("vcvtph2ps."))) { auto *DstTy = cast(CI->getType()); Rep = CI->getArgOperand(0); auto *SrcTy = cast(Rep->getType()); @@ -2731,13 +2731,13 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { if (CI->arg_size() >= 3) Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); - } else if (IsX86 && Name.startswith("avx512.mask.load")) { + } else if (IsX86 && Name.starts_with("avx512.mask.load")) { // "avx512.mask.loadu." or "avx512.mask.load." bool Aligned = Name[16] != 'u'; // "avx512.mask.loadu". 
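      // (By this point the "llvm." and "x86." prefixes have already been
      // stripped from Name, so "avx512.mask.load" occupies Name[0..15] and
      // Name[16] is the first character after it: 'u' selects the unaligned
      // "loadu" form.)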
Rep = UpgradeMaskedLoad(Builder, CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), Aligned); - } else if (IsX86 && Name.startswith("avx512.mask.expand.load.")) { + } else if (IsX86 && Name.starts_with("avx512.mask.expand.load.")) { auto *ResultTy = cast(CI->getType()); Type *PtrTy = ResultTy->getElementType(); @@ -2752,7 +2752,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Intrinsic::masked_expandload, ResultTy); Rep = Builder.CreateCall(ELd, { Ptr, MaskVec, CI->getOperand(1) }); - } else if (IsX86 && Name.startswith("avx512.mask.compress.store.")) { + } else if (IsX86 && Name.starts_with("avx512.mask.compress.store.")) { auto *ResultTy = cast(CI->getArgOperand(1)->getType()); Type *PtrTy = ResultTy->getElementType(); @@ -2768,8 +2768,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Intrinsic::masked_compressstore, ResultTy); Rep = Builder.CreateCall(CSt, { CI->getArgOperand(1), Ptr, MaskVec }); - } else if (IsX86 && (Name.startswith("avx512.mask.compress.") || - Name.startswith("avx512.mask.expand."))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.compress.") || + Name.starts_with("avx512.mask.expand."))) { auto *ResultTy = cast(CI->getType()); Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2), @@ -2781,13 +2781,13 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Function *Intr = Intrinsic::getDeclaration(F->getParent(), IID, ResultTy); Rep = Builder.CreateCall(Intr, { CI->getOperand(0), CI->getOperand(1), MaskVec }); - } else if (IsX86 && Name.startswith("xop.vpcom")) { + } else if (IsX86 && Name.starts_with("xop.vpcom")) { bool IsSigned; - if (Name.endswith("ub") || Name.endswith("uw") || Name.endswith("ud") || - Name.endswith("uq")) + if (Name.ends_with("ub") || Name.ends_with("uw") || Name.ends_with("ud") || + Name.ends_with("uq")) IsSigned = false; - else if (Name.endswith("b") || Name.endswith("w") || Name.endswith("d") || - Name.endswith("q")) + else if (Name.ends_with("b") || Name.ends_with("w") || Name.ends_with("d") || + Name.ends_with("q")) IsSigned = true; else llvm_unreachable("Unknown suffix"); @@ -2797,48 +2797,48 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Imm = cast(CI->getArgOperand(2))->getZExtValue(); } else { Name = Name.substr(9); // strip off "xop.vpcom" - if (Name.startswith("lt")) + if (Name.starts_with("lt")) Imm = 0; - else if (Name.startswith("le")) + else if (Name.starts_with("le")) Imm = 1; - else if (Name.startswith("gt")) + else if (Name.starts_with("gt")) Imm = 2; - else if (Name.startswith("ge")) + else if (Name.starts_with("ge")) Imm = 3; - else if (Name.startswith("eq")) + else if (Name.starts_with("eq")) Imm = 4; - else if (Name.startswith("ne")) + else if (Name.starts_with("ne")) Imm = 5; - else if (Name.startswith("false")) + else if (Name.starts_with("false")) Imm = 6; - else if (Name.startswith("true")) + else if (Name.starts_with("true")) Imm = 7; else llvm_unreachable("Unknown condition"); } Rep = upgradeX86vpcom(Builder, *CI, Imm, IsSigned); - } else if (IsX86 && Name.startswith("xop.vpcmov")) { + } else if (IsX86 && Name.starts_with("xop.vpcmov")) { Value *Sel = CI->getArgOperand(2); Value *NotSel = Builder.CreateNot(Sel); Value *Sel0 = Builder.CreateAnd(CI->getArgOperand(0), Sel); Value *Sel1 = Builder.CreateAnd(CI->getArgOperand(1), NotSel); Rep = Builder.CreateOr(Sel0, Sel1); - } else if (IsX86 && (Name.startswith("xop.vprot") || - Name.startswith("avx512.prol") || - Name.startswith("avx512.mask.prol"))) { + } else if 
(IsX86 && (Name.starts_with("xop.vprot") || + Name.starts_with("avx512.prol") || + Name.starts_with("avx512.mask.prol"))) { Rep = upgradeX86Rotate(Builder, *CI, false); - } else if (IsX86 && (Name.startswith("avx512.pror") || - Name.startswith("avx512.mask.pror"))) { + } else if (IsX86 && (Name.starts_with("avx512.pror") || + Name.starts_with("avx512.mask.pror"))) { Rep = upgradeX86Rotate(Builder, *CI, true); - } else if (IsX86 && (Name.startswith("avx512.vpshld.") || - Name.startswith("avx512.mask.vpshld") || - Name.startswith("avx512.maskz.vpshld"))) { + } else if (IsX86 && (Name.starts_with("avx512.vpshld.") || + Name.starts_with("avx512.mask.vpshld") || + Name.starts_with("avx512.maskz.vpshld"))) { bool ZeroMask = Name[11] == 'z'; Rep = upgradeX86ConcatShift(Builder, *CI, false, ZeroMask); - } else if (IsX86 && (Name.startswith("avx512.vpshrd.") || - Name.startswith("avx512.mask.vpshrd") || - Name.startswith("avx512.maskz.vpshrd"))) { + } else if (IsX86 && (Name.starts_with("avx512.vpshrd.") || + Name.starts_with("avx512.mask.vpshrd") || + Name.starts_with("avx512.maskz.vpshrd"))) { bool ZeroMask = Name[11] == 'z'; Rep = upgradeX86ConcatShift(Builder, *CI, true, ZeroMask); } else if (IsX86 && Name == "sse42.crc32.64.8") { @@ -2847,8 +2847,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C)); Rep = Builder.CreateCall(CRC32, {Trunc0, CI->getArgOperand(1)}); Rep = Builder.CreateZExt(Rep, CI->getType(), ""); - } else if (IsX86 && (Name.startswith("avx.vbroadcast.s") || - Name.startswith("avx512.vbroadcast.s"))) { + } else if (IsX86 && (Name.starts_with("avx.vbroadcast.s") || + Name.starts_with("avx512.vbroadcast.s"))) { // Replace broadcasts with a series of insertelements. auto *VecTy = cast(CI->getType()); Type *EltTy = VecTy->getElementType(); @@ -2859,12 +2859,12 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { for (unsigned I = 0; I < EltNum; ++I) Rep = Builder.CreateInsertElement(Rep, Load, ConstantInt::get(I32Ty, I)); - } else if (IsX86 && (Name.startswith("sse41.pmovsx") || - Name.startswith("sse41.pmovzx") || - Name.startswith("avx2.pmovsx") || - Name.startswith("avx2.pmovzx") || - Name.startswith("avx512.mask.pmovsx") || - Name.startswith("avx512.mask.pmovzx"))) { + } else if (IsX86 && (Name.starts_with("sse41.pmovsx") || + Name.starts_with("sse41.pmovzx") || + Name.starts_with("avx2.pmovsx") || + Name.starts_with("avx2.pmovzx") || + Name.starts_with("avx512.mask.pmovsx") || + Name.starts_with("avx512.mask.pmovzx"))) { auto *DstTy = cast(CI->getType()); unsigned NumDstElts = DstTy->getNumElements(); @@ -2891,7 +2891,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateTrunc(CI->getArgOperand(0), Ty); Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); - } else if (IsX86 && (Name.startswith("avx.vbroadcastf128") || + } else if (IsX86 && (Name.starts_with("avx.vbroadcastf128") || Name == "avx2.vbroadcasti128")) { // Replace vbroadcastf128/vbroadcasti128 with a vector load+shuffle. 
Type *EltTy = cast(CI->getType())->getElementType(); @@ -2905,8 +2905,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { else Rep = Builder.CreateShuffleVector( Load, ArrayRef{0, 1, 2, 3, 0, 1, 2, 3}); - } else if (IsX86 && (Name.startswith("avx512.mask.shuf.i") || - Name.startswith("avx512.mask.shuf.f"))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.shuf.i") || + Name.starts_with("avx512.mask.shuf.f"))) { unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); Type *VT = CI->getType(); unsigned NumLanes = VT->getPrimitiveSizeInBits() / 128; @@ -2927,8 +2927,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { CI->getArgOperand(1), ShuffleMask); Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep, CI->getArgOperand(3)); - }else if (IsX86 && (Name.startswith("avx512.mask.broadcastf") || - Name.startswith("avx512.mask.broadcasti"))) { + }else if (IsX86 && (Name.starts_with("avx512.mask.broadcastf") || + Name.starts_with("avx512.mask.broadcasti"))) { unsigned NumSrcElts = cast(CI->getArgOperand(0)->getType()) ->getNumElements(); @@ -2944,10 +2944,10 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { ShuffleMask); Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); - } else if (IsX86 && (Name.startswith("avx2.pbroadcast") || - Name.startswith("avx2.vbroadcast") || - Name.startswith("avx512.pbroadcast") || - Name.startswith("avx512.mask.broadcast.s"))) { + } else if (IsX86 && (Name.starts_with("avx2.pbroadcast") || + Name.starts_with("avx2.vbroadcast") || + Name.starts_with("avx512.pbroadcast") || + Name.starts_with("avx512.mask.broadcast.s"))) { // Replace vp?broadcasts with a vector shuffle. Value *Op = CI->getArgOperand(0); ElementCount EC = cast(CI->getType())->getElementCount(); @@ -2959,32 +2959,32 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { if (CI->arg_size() == 3) Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); - } else if (IsX86 && (Name.startswith("sse2.padds.") || - Name.startswith("avx2.padds.") || - Name.startswith("avx512.padds.") || - Name.startswith("avx512.mask.padds."))) { + } else if (IsX86 && (Name.starts_with("sse2.padds.") || + Name.starts_with("avx2.padds.") || + Name.starts_with("avx512.padds.") || + Name.starts_with("avx512.mask.padds."))) { Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::sadd_sat); - } else if (IsX86 && (Name.startswith("sse2.psubs.") || - Name.startswith("avx2.psubs.") || - Name.startswith("avx512.psubs.") || - Name.startswith("avx512.mask.psubs."))) { + } else if (IsX86 && (Name.starts_with("sse2.psubs.") || + Name.starts_with("avx2.psubs.") || + Name.starts_with("avx512.psubs.") || + Name.starts_with("avx512.mask.psubs."))) { Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::ssub_sat); - } else if (IsX86 && (Name.startswith("sse2.paddus.") || - Name.startswith("avx2.paddus.") || - Name.startswith("avx512.mask.paddus."))) { + } else if (IsX86 && (Name.starts_with("sse2.paddus.") || + Name.starts_with("avx2.paddus.") || + Name.starts_with("avx512.mask.paddus."))) { Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::uadd_sat); - } else if (IsX86 && (Name.startswith("sse2.psubus.") || - Name.startswith("avx2.psubus.") || - Name.startswith("avx512.mask.psubus."))) { + } else if (IsX86 && (Name.starts_with("sse2.psubus.") || + Name.starts_with("avx2.psubus.") || + Name.starts_with("avx512.mask.psubus."))) { Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::usub_sat); - } 
else if (IsX86 && Name.startswith("avx512.mask.palignr.")) { + } else if (IsX86 && Name.starts_with("avx512.mask.palignr.")) { Rep = UpgradeX86ALIGNIntrinsics(Builder, CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), CI->getArgOperand(3), CI->getArgOperand(4), false); - } else if (IsX86 && Name.startswith("avx512.mask.valign.")) { + } else if (IsX86 && Name.starts_with("avx512.mask.valign.")) { Rep = UpgradeX86ALIGNIntrinsics(Builder, CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), @@ -3016,10 +3016,10 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); Rep = UpgradeX86PSRLDQIntrinsics(Builder, CI->getArgOperand(0), Shift); } else if (IsX86 && (Name == "sse41.pblendw" || - Name.startswith("sse41.blendp") || - Name.startswith("avx.blend.p") || + Name.starts_with("sse41.blendp") || + Name.starts_with("avx.blend.p") || Name == "avx2.pblendw" || - Name.startswith("avx2.pblendd."))) { + Name.starts_with("avx2.pblendd."))) { Value *Op0 = CI->getArgOperand(0); Value *Op1 = CI->getArgOperand(1); unsigned Imm = cast (CI->getArgOperand(2))->getZExtValue(); @@ -3031,9 +3031,9 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Idxs[i] = ((Imm >> (i%8)) & 1) ? i + NumElts : i; Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs); - } else if (IsX86 && (Name.startswith("avx.vinsertf128.") || + } else if (IsX86 && (Name.starts_with("avx.vinsertf128.") || Name == "avx2.vinserti128" || - Name.startswith("avx512.mask.insert"))) { + Name.starts_with("avx512.mask.insert"))) { Value *Op0 = CI->getArgOperand(0); Value *Op1 = CI->getArgOperand(1); unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); @@ -3077,9 +3077,9 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { if (CI->arg_size() == 5) Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep, CI->getArgOperand(3)); - } else if (IsX86 && (Name.startswith("avx.vextractf128.") || + } else if (IsX86 && (Name.starts_with("avx.vextractf128.") || Name == "avx2.vextracti128" || - Name.startswith("avx512.mask.vextract"))) { + Name.starts_with("avx512.mask.vextract"))) { Value *Op0 = CI->getArgOperand(0); unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); unsigned DstNumElts = @@ -3104,8 +3104,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { CI->getArgOperand(2)); } else if (!IsX86 && Name == "stackprotectorcheck") { Rep = nullptr; - } else if (IsX86 && (Name.startswith("avx512.mask.perm.df.") || - Name.startswith("avx512.mask.perm.di."))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.perm.df.") || + Name.starts_with("avx512.mask.perm.di."))) { Value *Op0 = CI->getArgOperand(0); unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); auto *VecTy = cast(CI->getType()); @@ -3120,7 +3120,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { if (CI->arg_size() == 4) Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && (Name.startswith("avx.vperm2f128.") || + } else if (IsX86 && (Name.starts_with("avx.vperm2f128.") || Name == "avx2.vperm2i128")) { // The immediate permute control byte looks like this: // [1:0] - select 128 bits from sources for low half of destination @@ -3156,10 +3156,10 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateShuffleVector(V0, V1, ShuffleMask); - } else if (IsX86 && (Name.startswith("avx.vpermil.") || + } else if (IsX86 && (Name.starts_with("avx.vpermil.") 
|| Name == "sse2.pshuf.d" || - Name.startswith("avx512.mask.vpermil.p") || - Name.startswith("avx512.mask.pshuf.d."))) { + Name.starts_with("avx512.mask.vpermil.p") || + Name.starts_with("avx512.mask.pshuf.d."))) { Value *Op0 = CI->getArgOperand(0); unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); auto *VecTy = cast(CI->getType()); @@ -3181,7 +3181,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && (Name == "sse2.pshufl.w" || - Name.startswith("avx512.mask.pshufl.w."))) { + Name.starts_with("avx512.mask.pshufl.w."))) { Value *Op0 = CI->getArgOperand(0); unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); unsigned NumElts = cast(CI->getType())->getNumElements(); @@ -3200,7 +3200,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && (Name == "sse2.pshufh.w" || - Name.startswith("avx512.mask.pshufh.w."))) { + Name.starts_with("avx512.mask.pshufh.w."))) { Value *Op0 = CI->getArgOperand(0); unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); unsigned NumElts = cast(CI->getType())->getNumElements(); @@ -3218,7 +3218,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { if (CI->arg_size() == 4) Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && Name.startswith("avx512.mask.shuf.p")) { + } else if (IsX86 && Name.starts_with("avx512.mask.shuf.p")) { Value *Op0 = CI->getArgOperand(0); Value *Op1 = CI->getArgOperand(1); unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); @@ -3243,15 +3243,15 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep, CI->getArgOperand(3)); - } else if (IsX86 && (Name.startswith("avx512.mask.movddup") || - Name.startswith("avx512.mask.movshdup") || - Name.startswith("avx512.mask.movsldup"))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.movddup") || + Name.starts_with("avx512.mask.movshdup") || + Name.starts_with("avx512.mask.movsldup"))) { Value *Op0 = CI->getArgOperand(0); unsigned NumElts = cast(CI->getType())->getNumElements(); unsigned NumLaneElts = 128/CI->getType()->getScalarSizeInBits(); unsigned Offset = 0; - if (Name.startswith("avx512.mask.movshdup.")) + if (Name.starts_with("avx512.mask.movshdup.")) Offset = 1; SmallVector Idxs(NumElts); @@ -3265,8 +3265,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); - } else if (IsX86 && (Name.startswith("avx512.mask.punpckl") || - Name.startswith("avx512.mask.unpckl."))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.punpckl") || + Name.starts_with("avx512.mask.unpckl."))) { Value *Op0 = CI->getArgOperand(0); Value *Op1 = CI->getArgOperand(1); int NumElts = cast(CI->getType())->getNumElements(); @@ -3281,8 +3281,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && (Name.startswith("avx512.mask.punpckh") || - Name.startswith("avx512.mask.unpckh."))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.punpckh") || + Name.starts_with("avx512.mask.unpckh."))) { Value *Op0 = CI->getArgOperand(0); Value *Op1 = CI->getArgOperand(1); int NumElts = cast(CI->getType())->getNumElements(); @@ -3297,8 +3297,8 @@ 
void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && (Name.startswith("avx512.mask.and.") || - Name.startswith("avx512.mask.pand."))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.and.") || + Name.starts_with("avx512.mask.pand."))) { VectorType *FTy = cast(CI->getType()); VectorType *ITy = VectorType::getInteger(FTy); Rep = Builder.CreateAnd(Builder.CreateBitCast(CI->getArgOperand(0), ITy), @@ -3306,8 +3306,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateBitCast(Rep, FTy); Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && (Name.startswith("avx512.mask.andn.") || - Name.startswith("avx512.mask.pandn."))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.andn.") || + Name.starts_with("avx512.mask.pandn."))) { VectorType *FTy = cast(CI->getType()); VectorType *ITy = VectorType::getInteger(FTy); Rep = Builder.CreateNot(Builder.CreateBitCast(CI->getArgOperand(0), ITy)); @@ -3316,8 +3316,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateBitCast(Rep, FTy); Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && (Name.startswith("avx512.mask.or.") || - Name.startswith("avx512.mask.por."))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.or.") || + Name.starts_with("avx512.mask.por."))) { VectorType *FTy = cast(CI->getType()); VectorType *ITy = VectorType::getInteger(FTy); Rep = Builder.CreateOr(Builder.CreateBitCast(CI->getArgOperand(0), ITy), @@ -3325,8 +3325,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateBitCast(Rep, FTy); Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && (Name.startswith("avx512.mask.xor.") || - Name.startswith("avx512.mask.pxor."))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.xor.") || + Name.starts_with("avx512.mask.pxor."))) { VectorType *FTy = cast(CI->getType()); VectorType *ITy = VectorType::getInteger(FTy); Rep = Builder.CreateXor(Builder.CreateBitCast(CI->getArgOperand(0), ITy), @@ -3334,20 +3334,20 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateBitCast(Rep, FTy); Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && Name.startswith("avx512.mask.padd.")) { + } else if (IsX86 && Name.starts_with("avx512.mask.padd.")) { Rep = Builder.CreateAdd(CI->getArgOperand(0), CI->getArgOperand(1)); Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && Name.startswith("avx512.mask.psub.")) { + } else if (IsX86 && Name.starts_with("avx512.mask.psub.")) { Rep = Builder.CreateSub(CI->getArgOperand(0), CI->getArgOperand(1)); Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && Name.startswith("avx512.mask.pmull.")) { + } else if (IsX86 && Name.starts_with("avx512.mask.pmull.")) { Rep = Builder.CreateMul(CI->getArgOperand(0), CI->getArgOperand(1)); Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && Name.startswith("avx512.mask.add.p")) { - if (Name.endswith(".512")) { + } else if (IsX86 && Name.starts_with("avx512.mask.add.p")) { + if (Name.ends_with(".512")) { Intrinsic::ID IID; if (Name[17] == 's') IID = Intrinsic::x86_avx512_add_ps_512; @@ -3362,8 
+3362,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && Name.startswith("avx512.mask.div.p")) { - if (Name.endswith(".512")) { + } else if (IsX86 && Name.starts_with("avx512.mask.div.p")) { + if (Name.ends_with(".512")) { Intrinsic::ID IID; if (Name[17] == 's') IID = Intrinsic::x86_avx512_div_ps_512; @@ -3378,8 +3378,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && Name.startswith("avx512.mask.mul.p")) { - if (Name.endswith(".512")) { + } else if (IsX86 && Name.starts_with("avx512.mask.mul.p")) { + if (Name.ends_with(".512")) { Intrinsic::ID IID; if (Name[17] == 's') IID = Intrinsic::x86_avx512_mul_ps_512; @@ -3394,8 +3394,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && Name.startswith("avx512.mask.sub.p")) { - if (Name.endswith(".512")) { + } else if (IsX86 && Name.starts_with("avx512.mask.sub.p")) { + if (Name.ends_with(".512")) { Intrinsic::ID IID; if (Name[17] == 's') IID = Intrinsic::x86_avx512_sub_ps_512; @@ -3410,8 +3410,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && (Name.startswith("avx512.mask.max.p") || - Name.startswith("avx512.mask.min.p")) && + } else if (IsX86 && (Name.starts_with("avx512.mask.max.p") || + Name.starts_with("avx512.mask.min.p")) && Name.drop_front(18) == ".512") { bool IsDouble = Name[17] == 'd'; bool IsMin = Name[13] == 'i'; @@ -3426,14 +3426,14 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { CI->getArgOperand(4) }); Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); - } else if (IsX86 && Name.startswith("avx512.mask.lzcnt.")) { + } else if (IsX86 && Name.starts_with("avx512.mask.lzcnt.")) { Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, CI->getType()), { CI->getArgOperand(0), Builder.getInt1(false) }); Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); - } else if (IsX86 && Name.startswith("avx512.mask.psll")) { + } else if (IsX86 && Name.starts_with("avx512.mask.psll")) { bool IsImmediate = Name[16] == 'i' || (Name.size() > 18 && Name[18] == 'i'); bool IsVariable = Name[16] == 'v'; @@ -3460,7 +3460,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { IID = Intrinsic::x86_avx512_psllv_w_512; else llvm_unreachable("Unexpected size"); - } else if (Name.endswith(".128")) { + } else if (Name.ends_with(".128")) { if (Size == 'd') // avx512.mask.psll.d.128, avx512.mask.psll.di.128 IID = IsImmediate ? Intrinsic::x86_sse2_pslli_d : Intrinsic::x86_sse2_psll_d; @@ -3472,7 +3472,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { : Intrinsic::x86_sse2_psll_w; else llvm_unreachable("Unexpected size"); - } else if (Name.endswith(".256")) { + } else if (Name.ends_with(".256")) { if (Size == 'd') // avx512.mask.psll.d.256, avx512.mask.psll.di.256 IID = IsImmediate ? 
Intrinsic::x86_avx2_pslli_d : Intrinsic::x86_avx2_psll_d; @@ -3501,7 +3501,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } Rep = UpgradeX86MaskedShift(Builder, *CI, IID); - } else if (IsX86 && Name.startswith("avx512.mask.psrl")) { + } else if (IsX86 && Name.starts_with("avx512.mask.psrl")) { bool IsImmediate = Name[16] == 'i' || (Name.size() > 18 && Name[18] == 'i'); bool IsVariable = Name[16] == 'v'; @@ -3528,7 +3528,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { IID = Intrinsic::x86_avx512_psrlv_w_512; else llvm_unreachable("Unexpected size"); - } else if (Name.endswith(".128")) { + } else if (Name.ends_with(".128")) { if (Size == 'd') // avx512.mask.psrl.d.128, avx512.mask.psrl.di.128 IID = IsImmediate ? Intrinsic::x86_sse2_psrli_d : Intrinsic::x86_sse2_psrl_d; @@ -3540,7 +3540,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { : Intrinsic::x86_sse2_psrl_w; else llvm_unreachable("Unexpected size"); - } else if (Name.endswith(".256")) { + } else if (Name.ends_with(".256")) { if (Size == 'd') // avx512.mask.psrl.d.256, avx512.mask.psrl.di.256 IID = IsImmediate ? Intrinsic::x86_avx2_psrli_d : Intrinsic::x86_avx2_psrl_d; @@ -3569,7 +3569,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } Rep = UpgradeX86MaskedShift(Builder, *CI, IID); - } else if (IsX86 && Name.startswith("avx512.mask.psra")) { + } else if (IsX86 && Name.starts_with("avx512.mask.psra")) { bool IsImmediate = Name[16] == 'i' || (Name.size() > 18 && Name[18] == 'i'); bool IsVariable = Name[16] == 'v'; @@ -3592,7 +3592,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { IID = Intrinsic::x86_avx512_psrav_w_512; else llvm_unreachable("Unexpected size"); - } else if (Name.endswith(".128")) { + } else if (Name.ends_with(".128")) { if (Size == 'd') // avx512.mask.psra.d.128, avx512.mask.psra.di.128 IID = IsImmediate ? Intrinsic::x86_sse2_psrai_d : Intrinsic::x86_sse2_psra_d; @@ -3605,7 +3605,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { : Intrinsic::x86_sse2_psra_w; else llvm_unreachable("Unexpected size"); - } else if (Name.endswith(".256")) { + } else if (Name.ends_with(".256")) { if (Size == 'd') // avx512.mask.psra.d.256, avx512.mask.psra.di.256 IID = IsImmediate ? 
Intrinsic::x86_avx2_psrai_d : Intrinsic::x86_avx2_psra_d; @@ -3635,11 +3635,11 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } Rep = UpgradeX86MaskedShift(Builder, *CI, IID); - } else if (IsX86 && Name.startswith("avx512.mask.move.s")) { + } else if (IsX86 && Name.starts_with("avx512.mask.move.s")) { Rep = upgradeMaskedMove(Builder, *CI); - } else if (IsX86 && Name.startswith("avx512.cvtmask2")) { + } else if (IsX86 && Name.starts_with("avx512.cvtmask2")) { Rep = UpgradeMaskToInt(Builder, *CI); - } else if (IsX86 && Name.endswith(".movntdqa")) { + } else if (IsX86 && Name.ends_with(".movntdqa")) { MDNode *Node = MDNode::get( C, ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1))); @@ -3653,10 +3653,10 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Align(CI->getType()->getPrimitiveSizeInBits().getFixedValue() / 8)); LI->setMetadata(LLVMContext::MD_nontemporal, Node); Rep = LI; - } else if (IsX86 && (Name.startswith("fma.vfmadd.") || - Name.startswith("fma.vfmsub.") || - Name.startswith("fma.vfnmadd.") || - Name.startswith("fma.vfnmsub."))) { + } else if (IsX86 && (Name.starts_with("fma.vfmadd.") || + Name.starts_with("fma.vfmsub.") || + Name.starts_with("fma.vfnmadd.") || + Name.starts_with("fma.vfnmsub."))) { bool NegMul = Name[6] == 'n'; bool NegAcc = NegMul ? Name[8] == 's' : Name[7] == 's'; bool IsScalar = NegMul ? Name[12] == 's' : Name[11] == 's'; @@ -3685,7 +3685,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { if (IsScalar) Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0); - } else if (IsX86 && Name.startswith("fma4.vfmadd.s")) { + } else if (IsX86 && Name.starts_with("fma4.vfmadd.s")) { Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2) }; @@ -3700,11 +3700,11 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateInsertElement(Constant::getNullValue(CI->getType()), Rep, (uint64_t)0); - } else if (IsX86 && (Name.startswith("avx512.mask.vfmadd.s") || - Name.startswith("avx512.maskz.vfmadd.s") || - Name.startswith("avx512.mask3.vfmadd.s") || - Name.startswith("avx512.mask3.vfmsub.s") || - Name.startswith("avx512.mask3.vfnmsub.s"))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.vfmadd.s") || + Name.starts_with("avx512.maskz.vfmadd.s") || + Name.starts_with("avx512.mask3.vfmadd.s") || + Name.starts_with("avx512.mask3.vfmsub.s") || + Name.starts_with("avx512.mask3.vfnmsub.s"))) { bool IsMask3 = Name[11] == '3'; bool IsMaskZ = Name[11] == 'z'; // Drop the "avx512.mask." to make it easier. @@ -3758,13 +3758,13 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep, PassThru); Rep = Builder.CreateInsertElement(CI->getArgOperand(IsMask3 ? 
2 : 0), Rep, (uint64_t)0); - } else if (IsX86 && (Name.startswith("avx512.mask.vfmadd.p") || - Name.startswith("avx512.mask.vfnmadd.p") || - Name.startswith("avx512.mask.vfnmsub.p") || - Name.startswith("avx512.mask3.vfmadd.p") || - Name.startswith("avx512.mask3.vfmsub.p") || - Name.startswith("avx512.mask3.vfnmsub.p") || - Name.startswith("avx512.maskz.vfmadd.p"))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.vfmadd.p") || + Name.starts_with("avx512.mask.vfnmadd.p") || + Name.starts_with("avx512.mask.vfnmsub.p") || + Name.starts_with("avx512.mask3.vfmadd.p") || + Name.starts_with("avx512.mask3.vfmsub.p") || + Name.starts_with("avx512.mask3.vfnmsub.p") || + Name.starts_with("avx512.maskz.vfmadd.p"))) { bool IsMask3 = Name[11] == '3'; bool IsMaskZ = Name[11] == 'z'; // Drop the "avx512.mask." to make it easier. @@ -3807,7 +3807,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { CI->getArgOperand(0); Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); - } else if (IsX86 && Name.startswith("fma.vfmsubadd.p")) { + } else if (IsX86 && Name.starts_with("fma.vfmsubadd.p")) { unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); unsigned EltWidth = CI->getType()->getScalarSizeInBits(); Intrinsic::ID IID; @@ -3827,10 +3827,10 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Ops[2] = Builder.CreateFNeg(Ops[2]); Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), Ops); - } else if (IsX86 && (Name.startswith("avx512.mask.vfmaddsub.p") || - Name.startswith("avx512.mask3.vfmaddsub.p") || - Name.startswith("avx512.maskz.vfmaddsub.p") || - Name.startswith("avx512.mask3.vfmsubadd.p"))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.vfmaddsub.p") || + Name.starts_with("avx512.mask3.vfmaddsub.p") || + Name.starts_with("avx512.maskz.vfmaddsub.p") || + Name.starts_with("avx512.mask3.vfmsubadd.p"))) { bool IsMask3 = Name[11] == '3'; bool IsMaskZ = Name[11] == 'z'; // Drop the "avx512.mask." to make it easier. @@ -3878,8 +3878,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { CI->getArgOperand(0); Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); - } else if (IsX86 && (Name.startswith("avx512.mask.pternlog.") || - Name.startswith("avx512.maskz.pternlog."))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.pternlog.") || + Name.starts_with("avx512.maskz.pternlog."))) { bool ZeroMask = Name[11] == 'z'; unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); unsigned EltWidth = CI->getType()->getScalarSizeInBits(); @@ -3906,8 +3906,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru); - } else if (IsX86 && (Name.startswith("avx512.mask.vpmadd52") || - Name.startswith("avx512.maskz.vpmadd52"))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.vpmadd52") || + Name.starts_with("avx512.maskz.vpmadd52"))) { bool ZeroMask = Name[11] == 'z'; bool High = Name[20] == 'h' || Name[21] == 'h'; unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); @@ -3934,16 +3934,16 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Value *PassThru = ZeroMask ? 
ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); - } else if (IsX86 && (Name.startswith("avx512.mask.vpermi2var.") || - Name.startswith("avx512.mask.vpermt2var.") || - Name.startswith("avx512.maskz.vpermt2var."))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.vpermi2var.") || + Name.starts_with("avx512.mask.vpermt2var.") || + Name.starts_with("avx512.maskz.vpermt2var."))) { bool ZeroMask = Name[11] == 'z'; bool IndexForm = Name[17] == 'i'; Rep = UpgradeX86VPERMT2Intrinsics(Builder, *CI, ZeroMask, IndexForm); - } else if (IsX86 && (Name.startswith("avx512.mask.vpdpbusd.") || - Name.startswith("avx512.maskz.vpdpbusd.") || - Name.startswith("avx512.mask.vpdpbusds.") || - Name.startswith("avx512.maskz.vpdpbusds."))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.vpdpbusd.") || + Name.starts_with("avx512.maskz.vpdpbusd.") || + Name.starts_with("avx512.mask.vpdpbusds.") || + Name.starts_with("avx512.maskz.vpdpbusds."))) { bool ZeroMask = Name[11] == 'z'; bool IsSaturating = Name[ZeroMask ? 21 : 20] == 's'; unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); @@ -3970,10 +3970,10 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); - } else if (IsX86 && (Name.startswith("avx512.mask.vpdpwssd.") || - Name.startswith("avx512.maskz.vpdpwssd.") || - Name.startswith("avx512.mask.vpdpwssds.") || - Name.startswith("avx512.maskz.vpdpwssds."))) { + } else if (IsX86 && (Name.starts_with("avx512.mask.vpdpwssd.") || + Name.starts_with("avx512.maskz.vpdpwssd.") || + Name.starts_with("avx512.mask.vpdpwssds.") || + Name.starts_with("avx512.maskz.vpdpwssds."))) { bool ZeroMask = Name[11] == 'z'; bool IsSaturating = Name[ZeroMask ? 21 : 20] == 's'; unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); @@ -4033,7 +4033,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { CI->replaceAllUsesWith(CF); Rep = nullptr; - } else if (IsX86 && Name.startswith("avx512.mask.") && + } else if (IsX86 && Name.starts_with("avx512.mask.") && upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) { // Rep will be updated by the call in the condition. } else if (IsNVVM && (Name == "abs.i" || Name == "abs.ll")) { @@ -4042,8 +4042,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Value *Cmp = Builder.CreateICmpSGE( Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond"); Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs"); - } else if (IsNVVM && (Name.startswith("atomic.load.add.f32.p") || - Name.startswith("atomic.load.add.f64.p"))) { + } else if (IsNVVM && (Name.starts_with("atomic.load.add.f32.p") || + Name.starts_with("atomic.load.add.f64.p"))) { Value *Ptr = CI->getArgOperand(0); Value *Val = CI->getArgOperand(1); Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(), @@ -4052,7 +4052,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Name == "max.ui" || Name == "max.ull")) { Value *Arg0 = CI->getArgOperand(0); Value *Arg1 = CI->getArgOperand(1); - Value *Cmp = Name.endswith(".ui") || Name.endswith(".ull") + Value *Cmp = Name.ends_with(".ui") || Name.ends_with(".ull") ? 
Builder.CreateICmpUGE(Arg0, Arg1, "max.cond") : Builder.CreateICmpSGE(Arg0, Arg1, "max.cond"); Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max"); @@ -4060,7 +4060,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Name == "min.ui" || Name == "min.ull")) { Value *Arg0 = CI->getArgOperand(0); Value *Arg1 = CI->getArgOperand(1); - Value *Cmp = Name.endswith(".ui") || Name.endswith(".ull") + Value *Cmp = Name.ends_with(".ui") || Name.ends_with(".ull") ? Builder.CreateICmpULE(Arg0, Arg1, "min.cond") : Builder.CreateICmpSLE(Arg0, Arg1, "min.cond"); Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min"); @@ -4221,7 +4221,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { case Intrinsic::vector_extract: { StringRef Name = F->getName(); Name = Name.substr(5); // Strip llvm - if (!Name.startswith("aarch64.sve.tuple.get")) { + if (!Name.starts_with("aarch64.sve.tuple.get")) { DefaultCase(); return; } @@ -4237,11 +4237,11 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { case Intrinsic::vector_insert: { StringRef Name = F->getName(); Name = Name.substr(5); - if (!Name.startswith("aarch64.sve.tuple")) { + if (!Name.starts_with("aarch64.sve.tuple")) { DefaultCase(); return; } - if (Name.startswith("aarch64.sve.tuple.set")) { + if (Name.starts_with("aarch64.sve.tuple.set")) { unsigned I = dyn_cast(CI->getArgOperand(1))->getZExtValue(); ScalableVectorType *Ty = dyn_cast(CI->getArgOperand(2)->getType()); @@ -4251,7 +4251,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { NewFn, {CI->getArgOperand(0), CI->getArgOperand(2), NewIdx}); break; } - if (Name.startswith("aarch64.sve.tuple.create")) { + if (Name.starts_with("aarch64.sve.tuple.create")) { unsigned N = StringSwitch(Name) .StartsWith("aarch64.sve.tuple.create2", 2) .StartsWith("aarch64.sve.tuple.create3", 3) @@ -4330,7 +4330,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { StringRef Name = F->getName(); Name = Name.substr(5); // Strip llvm. // Upgrade `dbg.addr` to `dbg.value` with `DW_OP_deref`. - if (Name.startswith("dbg.addr")) { + if (Name.starts_with("dbg.addr")) { DIExpression *Expr = cast( cast(CI->getArgOperand(2))->getMetadata()); Expr = DIExpression::append(Expr, dwarf::DW_OP_deref); @@ -4936,7 +4936,7 @@ bool llvm::UpgradeModuleFlags(Module &M) { // Upgrade branch protection and return address signing module flags. The // module flag behavior for these fields were Error and now they are Min. 
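  // Min merges mismatched values by taking the smaller of the two, so modules
  // built without these protections can still be linked instead of erroring.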
if (ID->getString() == "branch-target-enforcement" || - ID->getString().startswith("sign-return-address")) { + ID->getString().starts_with("sign-return-address")) { if (auto *Behavior = mdconst::dyn_extract_or_null(Op->getOperand(0))) { if (Behavior->getLimitedValue() == Module::Error) { @@ -5039,7 +5039,7 @@ void llvm::UpgradeSectionAttributes(Module &M) { StringRef Section = GV.getSection(); - if (!Section.startswith("__DATA, __objc_catlist")) + if (!Section.starts_with("__DATA, __objc_catlist")) continue; // __DATA, __objc_catlist, regular, no_dead_strip @@ -5099,12 +5099,12 @@ static bool isOldLoopArgument(Metadata *MD) { auto *S = dyn_cast_or_null(T->getOperand(0)); if (!S) return false; - return S->getString().startswith("llvm.vectorizer."); + return S->getString().starts_with("llvm.vectorizer."); } static MDString *upgradeLoopTag(LLVMContext &C, StringRef OldTag) { StringRef OldPrefix = "llvm.vectorizer."; - assert(OldTag.startswith(OldPrefix) && "Expected old prefix"); + assert(OldTag.starts_with(OldPrefix) && "Expected old prefix"); if (OldTag == "llvm.vectorizer.unroll") return MDString::get(C, "llvm.loop.interleave.count"); @@ -5123,7 +5123,7 @@ static Metadata *upgradeLoopArgument(Metadata *MD) { auto *OldTag = dyn_cast_or_null(T->getOperand(0)); if (!OldTag) return MD; - if (!OldTag->getString().startswith("llvm.vectorizer.")) + if (!OldTag->getString().starts_with("llvm.vectorizer.")) return MD; // This has an old tag. Upgrade it. @@ -5157,7 +5157,7 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) { // The only data layout upgrades needed for pre-GCN are setting the address // space of globals to 1. if (T.isAMDGPU() && !T.isAMDGCN() && !DL.contains("-G") && - !DL.startswith("G")) { + !DL.starts_with("G")) { return DL.empty() ? std::string("G1") : (DL + "-G1").str(); } @@ -5179,7 +5179,7 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) { // Add missing non-integral declarations. // This goes before adding new address spaces to prevent incoherent string // values. - if (!DL.contains("-ni") && !DL.startswith("ni")) + if (!DL.contains("-ni") && !DL.starts_with("ni")) Res.append("-ni:7:8"); // Update ni:7 to ni:7:8. if (DL.ends_with("ni:7")) @@ -5187,9 +5187,9 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) { // Add sizing for address spaces 7 and 8 (fat raw buffers and buffer // resources) An empty data layout has already been upgraded to G1 by now. - if (!DL.contains("-p7") && !DL.startswith("p7")) + if (!DL.contains("-p7") && !DL.starts_with("p7")) Res.append("-p7:160:256:256:32"); - if (!DL.contains("-p8") && !DL.startswith("p8")) + if (!DL.contains("-p8") && !DL.starts_with("p8")) Res.append("-p8:128:128"); return Res; From b65da9848b46a94c37886772609c6ef9901a9f3a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 6 Nov 2023 13:18:20 +0000 Subject: [PATCH 17/40] [AsmPrinter] Use StringRef::starts_with/ends_with instead of startswith/endswith. NFC. 
startswith/endswith wrap starts_with/ends_with and will eventually go away (to more closely match string_view) --- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 3 ++- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 8 ++++---- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 2 +- llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index c6014cff24c10e..eb189dc1569c20 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2389,7 +2389,8 @@ bool AsmPrinter::doFinalization(Module &M) { OutStreamer->emitAddrsig(); for (const GlobalValue &GV : M.global_values()) { if (!GV.use_empty() && !GV.isThreadLocal() && - !GV.hasDLLImportStorageClass() && !GV.getName().startswith("llvm.") && + !GV.hasDLLImportStorageClass() && + !GV.getName().starts_with("llvm.") && !GV.hasAtLeastLocalUnnamedAddr()) OutStreamer->emitAddrsigSym(getSymbol(&GV)); } diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 06dcf0a4d463b4..3a9d5fa3b936e0 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -142,7 +142,7 @@ StringRef CodeViewDebug::getFullFilepath(const DIFile *File) { // If this is a Unix-style path, just use it as is. Don't try to canonicalize // it textually because one of the path components could be a symlink. - if (Dir.startswith("/") || Filename.startswith("/")) { + if (Dir.starts_with("/") || Filename.starts_with("/")) { if (llvm::sys::path::is_absolute(Filename, llvm::sys::path::Style::posix)) return Filename; Filepath = std::string(Dir); @@ -910,10 +910,10 @@ static std::string flattenCommandLine(ArrayRef Args, i++; // Skip this argument and next one. continue; } - if (Arg.startswith("-object-file-name") || Arg == MainFilename) + if (Arg.starts_with("-object-file-name") || Arg == MainFilename) continue; // Skip fmessage-length for reproduciability. - if (Arg.startswith("-fmessage-length")) + if (Arg.starts_with("-fmessage-length")) continue; if (PrintedOneArg) OS << " "; @@ -2583,7 +2583,7 @@ CodeViewDebug::lowerRecordFieldList(const DICompositeType *Ty) { // Virtual function pointer member. 
if ((Member->getFlags() & DINode::FlagArtificial) && - Member->getName().startswith("_vptr$")) { + Member->getName().starts_with("_vptr$")) { VFPtrRecord VFPR(getTypeIndex(Member->getBaseType())); ContinuationBuilder.writeMemberType(VFPR); MemberCount++; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index ce7dd910756a8c..d5ebbfb9668aea 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -445,7 +445,7 @@ DwarfDebug::DwarfDebug(AsmPrinter *A) DwarfDebug::~DwarfDebug() = default; static bool isObjCClass(StringRef Name) { - return Name.startswith("+") || Name.startswith("-"); + return Name.starts_with("+") || Name.starts_with("-"); } static bool hasObjCCategory(StringRef Name) { diff --git a/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp b/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp index 5d813b72c0b751..cd18703b359e1f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp @@ -75,7 +75,7 @@ static bool isPossibleIndirectCallTarget(const Function *F) { } MCSymbol *WinCFGuard::lookupImpSymbol(const MCSymbol *Sym) { - if (Sym->getName().startswith("__imp_")) + if (Sym->getName().starts_with("__imp_")) return nullptr; return Asm->OutContext.lookupSymbol(Twine("__imp_") + Sym->getName()); } From e96889d36f4528127e649bb90125094c75aa1628 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Mon, 6 Nov 2023 14:27:06 +0100 Subject: [PATCH 18/40] Revert "Fix compression header size check in ELF writer (#66888)" This broke lit tests in zstd-enabled builds; see the comment on the PR. > The test had 32-bit and 64-bit header sizes the wrong way around. This reverts commit c5ecf5a130f087f493802800f3565c7bb75c238a. --- llvm/lib/MC/ELFObjectWriter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index e4d18d8a7dd5b5..8490fefe7ff535 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -843,7 +843,7 @@ bool ELFWriter::maybeWriteCompression( uint32_t ChType, uint64_t Size, SmallVectorImpl &CompressedContents, Align Alignment) { uint64_t HdrSize = - is64Bit() ? sizeof(ELF::Elf64_Chdr) : sizeof(ELF::Elf32_Chdr); + is64Bit() ? sizeof(ELF::Elf32_Chdr) : sizeof(ELF::Elf64_Chdr); if (Size <= HdrSize + CompressedContents.size()) return false; // Platform specific header is followed by compressed data. From ace4489397d17abfb20d36de1404cfbe102401a7 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Mon, 6 Nov 2023 17:34:22 +0400 Subject: [PATCH 19/40] [clang][NFC] Refactor `CXXNewExpr::InitializationStyle` (#71322) This patch converts `CXXNewExpr::InitializationStyle` into a scoped enum at namespace scope. It also affirms the status quo by adding a new enumerator to represent an implicit initializer.
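As a rough sketch of what the migration looks like for callers (illustration only, not code from the patch; the helper name hasWrittenInitializer is invented here), the scoped enum also makes a previously implicit state expressible: before this change, a new-expression with an implicitly created initializer was encoded as NoInit plus a non-null initializer expression.

    #include "clang/AST/ExprCXX.h"
    #include "llvm/Support/ErrorHandling.h"

    using namespace clang;

    // Hypothetical helper: true only when the new-expression has an
    // initializer the user actually wrote in the source.
    static bool hasWrittenInitializer(const CXXNewExpr *New) {
      switch (New->getInitializationStyle()) {
      case CXXNewInitializationStyle::None:     // no initializer at all
      case CXXNewInitializationStyle::Implicit: // initializer exists, but was
                                                // not written by the user
        return false;
      case CXXNewInitializationStyle::Call:     // new T(args)
      case CXXNewInitializationStyle::List:     // new T{args}
        return true;
      }
      llvm_unreachable("covered switch");
    }

Every covered switch over the old enum (e.g. in MakeSmartPtrCheck.cpp and JSONNodeDumper.cpp below) now has to handle the new Implicit case as well, which is where most of the churn in this diff comes from.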
--- .../modernize/MakeSmartPtrCheck.cpp | 7 ++- clang/include/clang/AST/ExprCXX.h | 48 ++++++++------- clang/lib/AST/ExprCXX.cpp | 29 ++++----- clang/lib/AST/ItaniumMangle.cpp | 5 +- clang/lib/AST/JSONNodeDumper.cpp | 12 +++- clang/lib/AST/StmtPrinter.cpp | 7 ++- clang/lib/AST/StmtProfile.cpp | 2 +- clang/lib/Sema/SemaExprCXX.cpp | 60 ++++++++++--------- 8 files changed, 96 insertions(+), 74 deletions(-) diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp index 71fd8eca300c1b..616e57efa76ded 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp @@ -323,7 +323,8 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag, return false; }; switch (New->getInitializationStyle()) { - case CXXNewExpr::NoInit: { + case CXXNewInitializationStyle::None: + case CXXNewInitializationStyle::Implicit: { if (ArraySizeExpr.empty()) { Diag << FixItHint::CreateRemoval(SourceRange(NewStart, NewEnd)); } else { @@ -334,7 +335,7 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag, } break; } - case CXXNewExpr::CallInit: { + case CXXNewInitializationStyle::Call: { // FIXME: Add fixes for constructors with parameters that can be created // with a C++11 braced-init-list (e.g. std::vector, std::map). // Unlike ordinal cases, braced list can not be deduced in @@ -371,7 +372,7 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag, } break; } - case CXXNewExpr::ListInit: { + case CXXNewInitializationStyle::List: { // Range of the substring that we do not want to remove. SourceRange InitRange; if (const auto *NewConstruct = New->getConstructExpr()) { diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index a106bafcfa3e02..37d310ef967d9c 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -2206,6 +2206,20 @@ class CXXScalarValueInitExpr : public Expr { } }; +enum class CXXNewInitializationStyle { + /// New-expression has no initializer as written. + None, + + /// New-expression has no written initializer, but has an implicit one. + Implicit, + + /// New-expression has a C++98 paren-delimited initializer. + Call, + + /// New-expression has a C++11 list-initializer. + List +}; + /// Represents a new-expression for memory allocation and constructor /// calls, e.g: "new CXXNewExpr(foo)". class CXXNewExpr final @@ -2259,25 +2273,12 @@ class CXXNewExpr final return isParenTypeId(); } -public: - enum InitializationStyle { - /// New-expression has no initializer as written. - NoInit, - - /// New-expression has a C++98 paren-delimited initializer. - CallInit, - - /// New-expression has a C++11 list-initializer. - ListInit - }; - -private: /// Build a c++ new expression. 
CXXNewExpr(bool IsGlobalNew, FunctionDecl *OperatorNew, FunctionDecl *OperatorDelete, bool ShouldPassAlignment, bool UsualArrayDeleteWantsSize, ArrayRef PlacementArgs, SourceRange TypeIdParens, std::optional ArraySize, - InitializationStyle InitializationStyle, Expr *Initializer, + CXXNewInitializationStyle InitializationStyle, Expr *Initializer, QualType Ty, TypeSourceInfo *AllocatedTypeInfo, SourceRange Range, SourceRange DirectInitRange); @@ -2292,7 +2293,7 @@ class CXXNewExpr final FunctionDecl *OperatorDelete, bool ShouldPassAlignment, bool UsualArrayDeleteWantsSize, ArrayRef PlacementArgs, SourceRange TypeIdParens, std::optional ArraySize, - InitializationStyle InitializationStyle, Expr *Initializer, + CXXNewInitializationStyle InitializationStyle, Expr *Initializer, QualType Ty, TypeSourceInfo *AllocatedTypeInfo, SourceRange Range, SourceRange DirectInitRange); @@ -2388,15 +2389,20 @@ class CXXNewExpr final /// Whether this new-expression has any initializer at all. bool hasInitializer() const { - return CXXNewExprBits.StoredInitializationStyle > 0; + switch (getInitializationStyle()) { + case CXXNewInitializationStyle::None: + return false; + case CXXNewInitializationStyle::Implicit: + case CXXNewInitializationStyle::Call: + case CXXNewInitializationStyle::List: + return true; + } } /// The kind of initializer this new-expression has. - InitializationStyle getInitializationStyle() const { - if (CXXNewExprBits.StoredInitializationStyle == 0) - return NoInit; - return static_cast( - CXXNewExprBits.StoredInitializationStyle - 1); + CXXNewInitializationStyle getInitializationStyle() const { + return static_cast( + CXXNewExprBits.StoredInitializationStyle); } /// The initializer of this new-expression. diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp index 4d2e0e9a945a78..83af7998f68338 100644 --- a/clang/lib/AST/ExprCXX.cpp +++ b/clang/lib/AST/ExprCXX.cpp @@ -184,7 +184,7 @@ CXXNewExpr::CXXNewExpr(bool IsGlobalNew, FunctionDecl *OperatorNew, bool UsualArrayDeleteWantsSize, ArrayRef PlacementArgs, SourceRange TypeIdParens, std::optional ArraySize, - InitializationStyle InitializationStyle, + CXXNewInitializationStyle InitializationStyle, Expr *Initializer, QualType Ty, TypeSourceInfo *AllocatedTypeInfo, SourceRange Range, SourceRange DirectInitRange) @@ -193,7 +193,9 @@ CXXNewExpr::CXXNewExpr(bool IsGlobalNew, FunctionDecl *OperatorNew, AllocatedTypeInfo(AllocatedTypeInfo), Range(Range), DirectInitRange(DirectInitRange) { - assert((Initializer != nullptr || InitializationStyle == NoInit) && + assert((Initializer != nullptr || + InitializationStyle == CXXNewInitializationStyle::None || + InitializationStyle == CXXNewInitializationStyle::Implicit) && "Only NoInit can have no initializer!"); CXXNewExprBits.IsGlobalNew = IsGlobalNew; @@ -201,7 +203,7 @@ CXXNewExpr::CXXNewExpr(bool IsGlobalNew, FunctionDecl *OperatorNew, CXXNewExprBits.ShouldPassAlignment = ShouldPassAlignment; CXXNewExprBits.UsualArrayDeleteWantsSize = UsualArrayDeleteWantsSize; CXXNewExprBits.StoredInitializationStyle = - Initializer ? 
InitializationStyle + 1 : 0; + llvm::to_underlying(InitializationStyle); bool IsParenTypeId = TypeIdParens.isValid(); CXXNewExprBits.IsParenTypeId = IsParenTypeId; CXXNewExprBits.NumPlacementArgs = PlacementArgs.size(); @@ -217,10 +219,10 @@ CXXNewExpr::CXXNewExpr(bool IsGlobalNew, FunctionDecl *OperatorNew, getTrailingObjects()[0] = TypeIdParens; switch (getInitializationStyle()) { - case CallInit: + case CXXNewInitializationStyle::Call: this->Range.setEnd(DirectInitRange.getEnd()); break; - case ListInit: + case CXXNewInitializationStyle::List: this->Range.setEnd(getInitializer()->getSourceRange().getEnd()); break; default: @@ -240,15 +242,14 @@ CXXNewExpr::CXXNewExpr(EmptyShell Empty, bool IsArray, CXXNewExprBits.IsParenTypeId = IsParenTypeId; } -CXXNewExpr * -CXXNewExpr::Create(const ASTContext &Ctx, bool IsGlobalNew, - FunctionDecl *OperatorNew, FunctionDecl *OperatorDelete, - bool ShouldPassAlignment, bool UsualArrayDeleteWantsSize, - ArrayRef PlacementArgs, SourceRange TypeIdParens, - std::optional ArraySize, - InitializationStyle InitializationStyle, Expr *Initializer, - QualType Ty, TypeSourceInfo *AllocatedTypeInfo, - SourceRange Range, SourceRange DirectInitRange) { +CXXNewExpr *CXXNewExpr::Create( + const ASTContext &Ctx, bool IsGlobalNew, FunctionDecl *OperatorNew, + FunctionDecl *OperatorDelete, bool ShouldPassAlignment, + bool UsualArrayDeleteWantsSize, ArrayRef PlacementArgs, + SourceRange TypeIdParens, std::optional ArraySize, + CXXNewInitializationStyle InitializationStyle, Expr *Initializer, + QualType Ty, TypeSourceInfo *AllocatedTypeInfo, SourceRange Range, + SourceRange DirectInitRange) { bool IsArray = ArraySize.has_value(); bool HasInit = Initializer != nullptr; unsigned NumPlacementArgs = PlacementArgs.size(); diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 8530675ca2a1ce..5ac8c2e447cdb5 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -4826,7 +4826,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity, Out << '_'; mangleType(New->getAllocatedType()); if (New->hasInitializer()) { - if (New->getInitializationStyle() == CXXNewExpr::ListInit) + if (New->getInitializationStyle() == CXXNewInitializationStyle::List) Out << "il"; else Out << "pi"; @@ -4840,7 +4840,8 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity, } else if (const ParenListExpr *PLE = dyn_cast(Init)) { for (unsigned i = 0, e = PLE->getNumExprs(); i != e; ++i) mangleExpression(PLE->getExpr(i)); - } else if (New->getInitializationStyle() == CXXNewExpr::ListInit && + } else if (New->getInitializationStyle() == + CXXNewInitializationStyle::List && isa(Init)) { // Only take InitListExprs apart for list-initialization. 
mangleInitListElements(cast(Init)); diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp index 1a013b45c615d1..bc7bc7337b15e9 100644 --- a/clang/lib/AST/JSONNodeDumper.cpp +++ b/clang/lib/AST/JSONNodeDumper.cpp @@ -1351,9 +1351,15 @@ void JSONNodeDumper::VisitCXXNewExpr(const CXXNewExpr *NE) { attributeOnlyIfTrue("isArray", NE->isArray()); attributeOnlyIfTrue("isPlacement", NE->getNumPlacementArgs() != 0); switch (NE->getInitializationStyle()) { - case CXXNewExpr::NoInit: break; - case CXXNewExpr::CallInit: JOS.attribute("initStyle", "call"); break; - case CXXNewExpr::ListInit: JOS.attribute("initStyle", "list"); break; + case CXXNewInitializationStyle::None: + case CXXNewInitializationStyle::Implicit: + break; + case CXXNewInitializationStyle::Call: + JOS.attribute("initStyle", "call"); + break; + case CXXNewInitializationStyle::List: + JOS.attribute("initStyle", "list"); + break; } if (const FunctionDecl *FD = NE->getOperatorNew()) JOS.attribute("operatorNewDecl", createBareDeclRef(FD)); diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index 44dc9d3780f352..ab4a013de5f552 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -2298,9 +2298,10 @@ void StmtPrinter::VisitCXXNewExpr(CXXNewExpr *E) { if (E->isParenTypeId()) OS << ")"; - CXXNewExpr::InitializationStyle InitStyle = E->getInitializationStyle(); - if (InitStyle != CXXNewExpr::NoInit) { - bool Bare = InitStyle == CXXNewExpr::CallInit && + CXXNewInitializationStyle InitStyle = E->getInitializationStyle(); + if (InitStyle != CXXNewInitializationStyle::None && + InitStyle != CXXNewInitializationStyle::Implicit) { + bool Bare = InitStyle == CXXNewInitializationStyle::Call && !isa(E->getInitializer()); if (Bare) OS << "("; diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 6510fa369d78eb..8128219dd6f63c 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2096,7 +2096,7 @@ void StmtProfiler::VisitCXXNewExpr(const CXXNewExpr *S) { ID.AddInteger(S->getNumPlacementArgs()); ID.AddBoolean(S->isGlobalNew()); ID.AddBoolean(S->isParenTypeId()); - ID.AddInteger(S->getInitializationStyle()); + ID.AddInteger(llvm::to_underlying(S->getInitializationStyle())); } void diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 25d7759cc168dd..d947aba70d8f36 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -1946,7 +1946,7 @@ Sema::ActOnCXXNew(SourceLocation StartLoc, bool UseGlobal, Initializer); } -static bool isLegalArrayNewInitializer(CXXNewExpr::InitializationStyle Style, +static bool isLegalArrayNewInitializer(CXXNewInitializationStyle Style, Expr *Init) { if (!Init) return true; @@ -1957,7 +1957,7 @@ static bool isLegalArrayNewInitializer(CXXNewExpr::InitializationStyle Style, else if (CXXConstructExpr *CCE = dyn_cast(Init)) return !CCE->isListInitialization() && CCE->getConstructor()->isDefaultConstructor(); - else if (Style == CXXNewExpr::ListInit) { + else if (Style == CXXNewInitializationStyle::List) { assert(isa(Init) && "Shouldn't create list CXXConstructExprs for arrays."); return true; @@ -2008,44 +2008,49 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, SourceRange TypeRange = AllocTypeInfo->getTypeLoc().getSourceRange(); SourceLocation StartLoc = Range.getBegin(); - CXXNewExpr::InitializationStyle initStyle; + CXXNewInitializationStyle InitStyle; if (DirectInitRange.isValid()) { assert(Initializer && "Have parens but no 
initializer."); - initStyle = CXXNewExpr::CallInit; + InitStyle = CXXNewInitializationStyle::Call; } else if (Initializer && isa(Initializer)) - initStyle = CXXNewExpr::ListInit; + InitStyle = CXXNewInitializationStyle::List; else { assert((!Initializer || isa(Initializer) || isa(Initializer)) && "Initializer expression that cannot have been implicitly created."); - initStyle = CXXNewExpr::NoInit; + InitStyle = CXXNewInitializationStyle::None; } MultiExprArg Exprs(&Initializer, Initializer ? 1 : 0); if (ParenListExpr *List = dyn_cast_or_null(Initializer)) { - assert(initStyle == CXXNewExpr::CallInit && "paren init for non-call init"); + assert(InitStyle == CXXNewInitializationStyle::Call && + "paren init for non-call init"); Exprs = MultiExprArg(List->getExprs(), List->getNumExprs()); } // C++11 [expr.new]p15: // A new-expression that creates an object of type T initializes that // object as follows: - InitializationKind Kind - // - If the new-initializer is omitted, the object is default- - // initialized (8.5); if no initialization is performed, - // the object has indeterminate value - = initStyle == CXXNewExpr::NoInit - ? InitializationKind::CreateDefault(TypeRange.getBegin()) - // - Otherwise, the new-initializer is interpreted according to - // the - // initialization rules of 8.5 for direct-initialization. - : initStyle == CXXNewExpr::ListInit - ? InitializationKind::CreateDirectList( - TypeRange.getBegin(), Initializer->getBeginLoc(), - Initializer->getEndLoc()) - : InitializationKind::CreateDirect(TypeRange.getBegin(), - DirectInitRange.getBegin(), - DirectInitRange.getEnd()); + InitializationKind Kind = [&] { + switch (InitStyle) { + // - If the new-initializer is omitted, the object is default- + // initialized (8.5); if no initialization is performed, + // the object has indeterminate value + case CXXNewInitializationStyle::None: + case CXXNewInitializationStyle::Implicit: + return InitializationKind::CreateDefault(TypeRange.getBegin()); + // - Otherwise, the new-initializer is interpreted according to the + // initialization rules of 8.5 for direct-initialization. + case CXXNewInitializationStyle::Call: + return InitializationKind::CreateDirect(TypeRange.getBegin(), + DirectInitRange.getBegin(), + DirectInitRange.getEnd()); + case CXXNewInitializationStyle::List: + return InitializationKind::CreateDirectList(TypeRange.getBegin(), + Initializer->getBeginLoc(), + Initializer->getEndLoc()); + } + }(); // C++11 [dcl.spec.auto]p6. Deduce the type which 'auto' stands in for. auto *Deduced = AllocType->getContainedDeducedType(); @@ -2066,13 +2071,14 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, return ExprError(); } else if (Deduced && !Deduced->isDeduced()) { MultiExprArg Inits = Exprs; - bool Braced = (initStyle == CXXNewExpr::ListInit); + bool Braced = (InitStyle == CXXNewInitializationStyle::List); if (Braced) { auto *ILE = cast(Exprs[0]); Inits = MultiExprArg(ILE->getInits(), ILE->getNumInits()); } - if (initStyle == CXXNewExpr::NoInit || Inits.empty()) + if (InitStyle == CXXNewInitializationStyle::None || + InitStyle == CXXNewInitializationStyle::Implicit || Inits.empty()) return ExprError(Diag(StartLoc, diag::err_auto_new_requires_ctor_arg) << AllocType << TypeRange); if (Inits.size() > 1) { @@ -2396,7 +2402,7 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, // Array 'new' can't have any initializers except empty parentheses. // Initializer lists are also allowed, in C++11. Rely on the parser for the // dialect distinction. 
- if (ArraySize && !isLegalArrayNewInitializer(initStyle, Initializer)) { + if (ArraySize && !isLegalArrayNewInitializer(InitStyle, Initializer)) { SourceRange InitRange(Exprs.front()->getBeginLoc(), Exprs.back()->getEndLoc()); Diag(StartLoc, diag::err_new_array_init_args) << InitRange; @@ -2468,7 +2474,7 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, return CXXNewExpr::Create(Context, UseGlobal, OperatorNew, OperatorDelete, PassAlignment, UsualArrayDeleteWantsSize, - PlacementArgs, TypeIdParens, ArraySize, initStyle, + PlacementArgs, TypeIdParens, ArraySize, InitStyle, Initializer, ResultType, AllocTypeInfo, Range, DirectInitRange); } From abc27bd31f6064ac8aec75459351db856573b151 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 6 Nov 2023 14:41:03 +0100 Subject: [PATCH 20/40] [InstCombine] Avoid some FP cast constant expressions (NFCI) Instead of doing fptoxi and xitofp casts to check for round-trip, directly check the IsExact flag on the convertToInteger() API. --- .../InstCombine/InstCombineCompares.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index f06657c8cd7633..55e26d09cd6e82 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -7324,17 +7324,14 @@ Instruction *InstCombinerImpl::foldFCmpIntToFPConst(FCmpInst &I, } // Okay, now we know that the FP constant fits in the range [SMIN, SMAX] or - // [0, UMAX], but it may still be fractional. See if it is fractional by - // casting the FP value to the integer value and back, checking for equality. + // [0, UMAX], but it may still be fractional. Check whether this is the case + // using the IsExact flag. // Don't do this for zero, because -0.0 is not fractional. - Constant *RHSInt = LHSUnsigned - ? ConstantExpr::getFPToUI(RHSC, IntTy) - : ConstantExpr::getFPToSI(RHSC, IntTy); + APSInt RHSInt(IntWidth, LHSUnsigned); + bool IsExact; + RHS.convertToInteger(RHSInt, APFloat::rmTowardZero, &IsExact); if (!RHS.isZero()) { - bool Equal = LHSUnsigned - ? ConstantExpr::getUIToFP(RHSInt, RHSC->getType()) == RHSC - : ConstantExpr::getSIToFP(RHSInt, RHSC->getType()) == RHSC; - if (!Equal) { + if (!IsExact) { // If we had a comparison against a fractional value, we have to adjust // the compare predicate and sometimes the value. RHSC is rounded towards // zero at this point. @@ -7400,7 +7397,7 @@ Instruction *InstCombinerImpl::foldFCmpIntToFPConst(FCmpInst &I, // Lower this FP comparison into an appropriate integer version of the // comparison. - return new ICmpInst(Pred, LHSI->getOperand(0), RHSInt); + return new ICmpInst(Pred, LHSI->getOperand(0), Builder.getInt(RHSInt)); } /// Fold (C / X) < 0.0 --> X < 0.0 if possible. Swap predicate if necessary. From 22a323e3db0cad736b2e793dd8143195fd9c0fc0 Mon Sep 17 00:00:00 2001 From: sstipanovic <146831748+sstipanovic@users.noreply.github.com> Date: Mon, 6 Nov 2023 14:52:27 +0100 Subject: [PATCH 21/40] [AMDGPU] Select v_lshl_add_u32 instead of v_mul_lo_u32 by constant (#71035) Instead of: v_mul_lo_u32 v0, v0, 5 we should generate: v_lshl_add_u32 v0, v0, 2, v0. 
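The rewrite relies on the identity x * (2^k + 1) == (x << k) + x in wrap-around 32-bit arithmetic, with k = Log2_32(C - 1) whenever C - 1 is a power of two, which is exactly what the IsPow2Plus1 PatLeaf below tests. A minimal standalone sketch of that identity in plain C++ (not LLVM code; the helper names are invented for illustration):

    #include <cassert>
    #include <cstdint>

    // Mirrors the IsPow2Plus1 PatLeaf: C - 1 must be a nonzero power of two.
    static bool isPow2Plus1(uint32_t C) {
      uint32_t V = C - 1;
      return V != 0 && (V & (V - 1)) == 0;
    }

    // What v_lshl_add_u32 computes with src0 = X, src1 = K, src2 = X.
    static uint32_t lshlAdd(uint32_t X, unsigned K) { return (X << K) + X; }

    int main() {
      for (uint32_t C : {3u, 5u, 9u, 17u, 4097u}) {
        assert(isPow2Plus1(C));
        unsigned K = __builtin_ctz(C - 1); // Log2_32(C - 1) for a power of two
        for (uint32_t X : {0u, 1u, 7u, 0xDEADBEEFu})
          assert(lshlAdd(X, K) == X * C); // both sides wrap identically
      }
      return 0;
    }

This lines up with the updated mul.ll checks below: mul i32 %val, 9 now selects v_lshl_add_u32 v0, v0, 3, v0 on GFX9 and later, while SI/VI keep v_mul_lo_u32.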
--- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 14 +++++++ llvm/test/CodeGen/AMDGPU/mul.ll | 39 +++++++++++++++++++ llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll | 8 ++-- llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll | 14 +++---- llvm/test/CodeGen/AMDGPU/wqm.ll | 4 +- 5 files changed, 66 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index c0e0ac1b4ec887..114d33b077866a 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -515,6 +515,16 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, let HasExtVOP3DPP = 0; } +def IsPow2Plus1: PatLeaf<(i32 imm), [{ + uint32_t V = N->getZExtValue(); + return isPowerOf2_32(V - 1); +}]>; + +def Log2_32: SDNodeXFormgetZExtValue(); + return CurDAG->getTargetConstant(Log2_32(V - 1), SDLoc(N), MVT::i32); +}]>; + let SubtargetPredicate = isGFX9Plus in { let isCommutable = 1, isReMaterializable = 1 in { defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile>; @@ -612,6 +622,10 @@ def : ThreeOp_i32_Pats; def : ThreeOp_i32_Pats; def : ThreeOp_i32_Pats; +def : GCNPat< + (DivergentBinFrag i32:$src0, IsPow2Plus1:$src1), + (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; + let SubtargetPredicate = isGFX940Plus in def : GCNPat< (ThreeOpFrag i64:$src0, i32:$src1, i64:$src2), diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index da7645d5011fc9..179b0a855182ce 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -2644,6 +2644,45 @@ entry: ret void } +define i32 @mul_pow2_plus_1(i32 %val) { +; SI-LABEL: mul_pow2_plus_1: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mul_lo_u32 v0, v0, 9 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: mul_pow2_plus_1: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_lo_u32 v0, v0, 9 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: mul_pow2_plus_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 3, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: mul_pow2_plus_1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 3, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: mul_pow2_plus_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshl_add_u32 v0, v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; EG-LABEL: mul_pow2_plus_1: +; EG: ; %bb.0: +; EG-NEXT: CF_END +; EG-NEXT: PAD + %mul = mul i32 %val, 9 + ret i32 %mul +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 60f309c0c1882f..963575d2acc51d 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -141,9 +141,9 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: bb.2.Flow: ; SI-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %10, %bb.4 - ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.1, %9, %bb.4 - ; SI-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %35:vgpr_32, %bb.4 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %31:vgpr_32, %bb.1, %10, %bb.4 + ; SI-NEXT: 
[[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %9, %bb.4 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %34:vgpr_32, %bb.4 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.3 ; SI-NEXT: {{ $}} @@ -158,7 +158,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: successors: %bb.2(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY2]], 0, [[PHI1]], 0, 0, implicit $mode, implicit $exec - ; SI-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec + ; SI-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 killed [[PHI1]], 1, [[PHI1]], implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.5.if.end: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll index c71dc06c68d8d6..9183f043f052cb 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll @@ -92,20 +92,20 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: .LBB2_1: ; %if.end ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; SI-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; SI-NEXT: v_add_nc_u32_e32 v2, 1, v0 +; SI-NEXT: v_add_nc_u32_e32 v2, 1, v3 ; SI-NEXT: s_add_i32 s1, s1, 1 ; SI-NEXT: s_cmp_lt_i32 s1, s0 ; SI-NEXT: s_cbranch_scc0 .LBB2_6 ; SI-NEXT: .LBB2_2: ; %for.body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b32 s2, vcc_lo ; SI-NEXT: s_xor_b32 s2, exec_lo, s2 ; SI-NEXT: ; %bb.3: ; %else ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; SI-NEXT: v_mul_lo_u32 v0, v2, 3 -; SI-NEXT: v_mul_f32_e32 v3, v1, v2 +; SI-NEXT: v_mul_f32_e32 v0, v1, v2 +; SI-NEXT: v_lshl_add_u32 v3, v2, 1, v2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; %bb.4: ; %Flow ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 @@ -113,11 +113,11 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: s_cbranch_execz .LBB2_1 ; SI-NEXT: ; %bb.5: ; %if ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; SI-NEXT: v_mul_f32_e32 v3, s1, v1 -; SI-NEXT: v_add_nc_u32_e32 v0, 1, v2 +; SI-NEXT: v_mul_f32_e32 v0, s1, v1 +; SI-NEXT: v_add_nc_u32_e32 v3, 1, v2 ; SI-NEXT: s_branch .LBB2_1 ; SI-NEXT: .LBB2_6: ; %for.end -; SI-NEXT: v_add_f32_e32 v0, v0, v3 +; SI-NEXT: v_add_f32_e32 v0, v3, v0 ; SI-NEXT: ; return to shader part epilog entry: ; %break = icmp sgt i32 %bound, 0 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 95f947cbca14f0..6bb066f06dd9a2 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -1536,7 +1536,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3 ; GFX9-W64-NEXT: ; %bb.2: ; %Flow ; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15] ; GFX9-W64-NEXT: ; %bb.3: ; %IF -; GFX9-W64-NEXT: v_mul_lo_u32 v0, v5, 3 +; GFX9-W64-NEXT: v_lshl_add_u32 v0, v5, 1, v5 ; GFX9-W64-NEXT: ; %bb.4: ; %END ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] @@ -1566,7 +1566,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3 ; GFX10-W32-NEXT: ; %bb.2: ; %Flow ; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13 ; 
GFX10-W32-NEXT: ; %bb.3: ; %IF -; GFX10-W32-NEXT: v_mul_lo_u32 v0, v5, 3 +; GFX10-W32-NEXT: v_lshl_add_u32 v0, v5, 1, v5 ; GFX10-W32-NEXT: ; %bb.4: ; %END ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 From 43e5f77500acd64fb2738ef2832691ad488048a8 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Mon, 6 Nov 2023 17:54:09 +0400 Subject: [PATCH 22/40] Revert "[clang][NFC] Refactor `CXXNewExpr::InitializationStyle`" (#71395) Reverts llvm/llvm-project#71322 --- .../modernize/MakeSmartPtrCheck.cpp | 7 +-- clang/include/clang/AST/ExprCXX.h | 48 +++++++-------- clang/lib/AST/ExprCXX.cpp | 29 +++++---- clang/lib/AST/ItaniumMangle.cpp | 5 +- clang/lib/AST/JSONNodeDumper.cpp | 12 +--- clang/lib/AST/StmtPrinter.cpp | 7 +-- clang/lib/AST/StmtProfile.cpp | 2 +- clang/lib/Sema/SemaExprCXX.cpp | 60 +++++++++---------- 8 files changed, 74 insertions(+), 96 deletions(-) diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp index 616e57efa76ded..71fd8eca300c1b 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp @@ -323,8 +323,7 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag, return false; }; switch (New->getInitializationStyle()) { - case CXXNewInitializationStyle::None: - case CXXNewInitializationStyle::Implicit: { + case CXXNewExpr::NoInit: { if (ArraySizeExpr.empty()) { Diag << FixItHint::CreateRemoval(SourceRange(NewStart, NewEnd)); } else { @@ -335,7 +334,7 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag, } break; } - case CXXNewInitializationStyle::Call: { + case CXXNewExpr::CallInit: { // FIXME: Add fixes for constructors with parameters that can be created // with a C++11 braced-init-list (e.g. std::vector, std::map). // Unlike ordinal cases, braced list can not be deduced in @@ -372,7 +371,7 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag, } break; } - case CXXNewInitializationStyle::List: { + case CXXNewExpr::ListInit: { // Range of the substring that we do not want to remove. SourceRange InitRange; if (const auto *NewConstruct = New->getConstructExpr()) { diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 37d310ef967d9c..a106bafcfa3e02 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -2206,20 +2206,6 @@ class CXXScalarValueInitExpr : public Expr { } }; -enum class CXXNewInitializationStyle { - /// New-expression has no initializer as written. - None, - - /// New-expression has no written initializer, but has an implicit one. - Implicit, - - /// New-expression has a C++98 paren-delimited initializer. - Call, - - /// New-expression has a C++11 list-initializer. - List -}; - /// Represents a new-expression for memory allocation and constructor /// calls, e.g: "new CXXNewExpr(foo)". class CXXNewExpr final @@ -2273,12 +2259,25 @@ class CXXNewExpr final return isParenTypeId(); } +public: + enum InitializationStyle { + /// New-expression has no initializer as written. + NoInit, + + /// New-expression has a C++98 paren-delimited initializer. + CallInit, + + /// New-expression has a C++11 list-initializer. + ListInit + }; + +private: /// Build a c++ new expression. 
CXXNewExpr(bool IsGlobalNew, FunctionDecl *OperatorNew, FunctionDecl *OperatorDelete, bool ShouldPassAlignment, bool UsualArrayDeleteWantsSize, ArrayRef PlacementArgs, SourceRange TypeIdParens, std::optional ArraySize, - CXXNewInitializationStyle InitializationStyle, Expr *Initializer, + InitializationStyle InitializationStyle, Expr *Initializer, QualType Ty, TypeSourceInfo *AllocatedTypeInfo, SourceRange Range, SourceRange DirectInitRange); @@ -2293,7 +2292,7 @@ class CXXNewExpr final FunctionDecl *OperatorDelete, bool ShouldPassAlignment, bool UsualArrayDeleteWantsSize, ArrayRef PlacementArgs, SourceRange TypeIdParens, std::optional ArraySize, - CXXNewInitializationStyle InitializationStyle, Expr *Initializer, + InitializationStyle InitializationStyle, Expr *Initializer, QualType Ty, TypeSourceInfo *AllocatedTypeInfo, SourceRange Range, SourceRange DirectInitRange); @@ -2389,20 +2388,15 @@ class CXXNewExpr final /// Whether this new-expression has any initializer at all. bool hasInitializer() const { - switch (getInitializationStyle()) { - case CXXNewInitializationStyle::None: - return false; - case CXXNewInitializationStyle::Implicit: - case CXXNewInitializationStyle::Call: - case CXXNewInitializationStyle::List: - return true; - } + return CXXNewExprBits.StoredInitializationStyle > 0; } /// The kind of initializer this new-expression has. - CXXNewInitializationStyle getInitializationStyle() const { - return static_cast( - CXXNewExprBits.StoredInitializationStyle); + InitializationStyle getInitializationStyle() const { + if (CXXNewExprBits.StoredInitializationStyle == 0) + return NoInit; + return static_cast( + CXXNewExprBits.StoredInitializationStyle - 1); } /// The initializer of this new-expression. diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp index 83af7998f68338..4d2e0e9a945a78 100644 --- a/clang/lib/AST/ExprCXX.cpp +++ b/clang/lib/AST/ExprCXX.cpp @@ -184,7 +184,7 @@ CXXNewExpr::CXXNewExpr(bool IsGlobalNew, FunctionDecl *OperatorNew, bool UsualArrayDeleteWantsSize, ArrayRef PlacementArgs, SourceRange TypeIdParens, std::optional ArraySize, - CXXNewInitializationStyle InitializationStyle, + InitializationStyle InitializationStyle, Expr *Initializer, QualType Ty, TypeSourceInfo *AllocatedTypeInfo, SourceRange Range, SourceRange DirectInitRange) @@ -193,9 +193,7 @@ CXXNewExpr::CXXNewExpr(bool IsGlobalNew, FunctionDecl *OperatorNew, AllocatedTypeInfo(AllocatedTypeInfo), Range(Range), DirectInitRange(DirectInitRange) { - assert((Initializer != nullptr || - InitializationStyle == CXXNewInitializationStyle::None || - InitializationStyle == CXXNewInitializationStyle::Implicit) && + assert((Initializer != nullptr || InitializationStyle == NoInit) && "Only NoInit can have no initializer!"); CXXNewExprBits.IsGlobalNew = IsGlobalNew; @@ -203,7 +201,7 @@ CXXNewExpr::CXXNewExpr(bool IsGlobalNew, FunctionDecl *OperatorNew, CXXNewExprBits.ShouldPassAlignment = ShouldPassAlignment; CXXNewExprBits.UsualArrayDeleteWantsSize = UsualArrayDeleteWantsSize; CXXNewExprBits.StoredInitializationStyle = - llvm::to_underlying(InitializationStyle); + Initializer ? 
InitializationStyle + 1 : 0; bool IsParenTypeId = TypeIdParens.isValid(); CXXNewExprBits.IsParenTypeId = IsParenTypeId; CXXNewExprBits.NumPlacementArgs = PlacementArgs.size(); @@ -219,10 +217,10 @@ CXXNewExpr::CXXNewExpr(bool IsGlobalNew, FunctionDecl *OperatorNew, getTrailingObjects()[0] = TypeIdParens; switch (getInitializationStyle()) { - case CXXNewInitializationStyle::Call: + case CallInit: this->Range.setEnd(DirectInitRange.getEnd()); break; - case CXXNewInitializationStyle::List: + case ListInit: this->Range.setEnd(getInitializer()->getSourceRange().getEnd()); break; default: @@ -242,14 +240,15 @@ CXXNewExpr::CXXNewExpr(EmptyShell Empty, bool IsArray, CXXNewExprBits.IsParenTypeId = IsParenTypeId; } -CXXNewExpr *CXXNewExpr::Create( - const ASTContext &Ctx, bool IsGlobalNew, FunctionDecl *OperatorNew, - FunctionDecl *OperatorDelete, bool ShouldPassAlignment, - bool UsualArrayDeleteWantsSize, ArrayRef PlacementArgs, - SourceRange TypeIdParens, std::optional ArraySize, - CXXNewInitializationStyle InitializationStyle, Expr *Initializer, - QualType Ty, TypeSourceInfo *AllocatedTypeInfo, SourceRange Range, - SourceRange DirectInitRange) { +CXXNewExpr * +CXXNewExpr::Create(const ASTContext &Ctx, bool IsGlobalNew, + FunctionDecl *OperatorNew, FunctionDecl *OperatorDelete, + bool ShouldPassAlignment, bool UsualArrayDeleteWantsSize, + ArrayRef PlacementArgs, SourceRange TypeIdParens, + std::optional ArraySize, + InitializationStyle InitializationStyle, Expr *Initializer, + QualType Ty, TypeSourceInfo *AllocatedTypeInfo, + SourceRange Range, SourceRange DirectInitRange) { bool IsArray = ArraySize.has_value(); bool HasInit = Initializer != nullptr; unsigned NumPlacementArgs = PlacementArgs.size(); diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 5ac8c2e447cdb5..8530675ca2a1ce 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -4826,7 +4826,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity, Out << '_'; mangleType(New->getAllocatedType()); if (New->hasInitializer()) { - if (New->getInitializationStyle() == CXXNewInitializationStyle::List) + if (New->getInitializationStyle() == CXXNewExpr::ListInit) Out << "il"; else Out << "pi"; @@ -4840,8 +4840,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity, } else if (const ParenListExpr *PLE = dyn_cast(Init)) { for (unsigned i = 0, e = PLE->getNumExprs(); i != e; ++i) mangleExpression(PLE->getExpr(i)); - } else if (New->getInitializationStyle() == - CXXNewInitializationStyle::List && + } else if (New->getInitializationStyle() == CXXNewExpr::ListInit && isa(Init)) { // Only take InitListExprs apart for list-initialization. 
mangleInitListElements(cast(Init)); diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp index bc7bc7337b15e9..1a013b45c615d1 100644 --- a/clang/lib/AST/JSONNodeDumper.cpp +++ b/clang/lib/AST/JSONNodeDumper.cpp @@ -1351,15 +1351,9 @@ void JSONNodeDumper::VisitCXXNewExpr(const CXXNewExpr *NE) { attributeOnlyIfTrue("isArray", NE->isArray()); attributeOnlyIfTrue("isPlacement", NE->getNumPlacementArgs() != 0); switch (NE->getInitializationStyle()) { - case CXXNewInitializationStyle::None: - case CXXNewInitializationStyle::Implicit: - break; - case CXXNewInitializationStyle::Call: - JOS.attribute("initStyle", "call"); - break; - case CXXNewInitializationStyle::List: - JOS.attribute("initStyle", "list"); - break; + case CXXNewExpr::NoInit: break; + case CXXNewExpr::CallInit: JOS.attribute("initStyle", "call"); break; + case CXXNewExpr::ListInit: JOS.attribute("initStyle", "list"); break; } if (const FunctionDecl *FD = NE->getOperatorNew()) JOS.attribute("operatorNewDecl", createBareDeclRef(FD)); diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index ab4a013de5f552..44dc9d3780f352 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -2298,10 +2298,9 @@ void StmtPrinter::VisitCXXNewExpr(CXXNewExpr *E) { if (E->isParenTypeId()) OS << ")"; - CXXNewInitializationStyle InitStyle = E->getInitializationStyle(); - if (InitStyle != CXXNewInitializationStyle::None && - InitStyle != CXXNewInitializationStyle::Implicit) { - bool Bare = InitStyle == CXXNewInitializationStyle::Call && + CXXNewExpr::InitializationStyle InitStyle = E->getInitializationStyle(); + if (InitStyle != CXXNewExpr::NoInit) { + bool Bare = InitStyle == CXXNewExpr::CallInit && !isa(E->getInitializer()); if (Bare) OS << "("; diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 8128219dd6f63c..6510fa369d78eb 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2096,7 +2096,7 @@ void StmtProfiler::VisitCXXNewExpr(const CXXNewExpr *S) { ID.AddInteger(S->getNumPlacementArgs()); ID.AddBoolean(S->isGlobalNew()); ID.AddBoolean(S->isParenTypeId()); - ID.AddInteger(llvm::to_underlying(S->getInitializationStyle())); + ID.AddInteger(S->getInitializationStyle()); } void diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index d947aba70d8f36..25d7759cc168dd 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -1946,7 +1946,7 @@ Sema::ActOnCXXNew(SourceLocation StartLoc, bool UseGlobal, Initializer); } -static bool isLegalArrayNewInitializer(CXXNewInitializationStyle Style, +static bool isLegalArrayNewInitializer(CXXNewExpr::InitializationStyle Style, Expr *Init) { if (!Init) return true; @@ -1957,7 +1957,7 @@ static bool isLegalArrayNewInitializer(CXXNewInitializationStyle Style, else if (CXXConstructExpr *CCE = dyn_cast(Init)) return !CCE->isListInitialization() && CCE->getConstructor()->isDefaultConstructor(); - else if (Style == CXXNewInitializationStyle::List) { + else if (Style == CXXNewExpr::ListInit) { assert(isa(Init) && "Shouldn't create list CXXConstructExprs for arrays."); return true; @@ -2008,49 +2008,44 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, SourceRange TypeRange = AllocTypeInfo->getTypeLoc().getSourceRange(); SourceLocation StartLoc = Range.getBegin(); - CXXNewInitializationStyle InitStyle; + CXXNewExpr::InitializationStyle initStyle; if (DirectInitRange.isValid()) { assert(Initializer && "Have parens but no 
initializer."); - InitStyle = CXXNewInitializationStyle::Call; + initStyle = CXXNewExpr::CallInit; } else if (Initializer && isa(Initializer)) - InitStyle = CXXNewInitializationStyle::List; + initStyle = CXXNewExpr::ListInit; else { assert((!Initializer || isa(Initializer) || isa(Initializer)) && "Initializer expression that cannot have been implicitly created."); - InitStyle = CXXNewInitializationStyle::None; + initStyle = CXXNewExpr::NoInit; } MultiExprArg Exprs(&Initializer, Initializer ? 1 : 0); if (ParenListExpr *List = dyn_cast_or_null(Initializer)) { - assert(InitStyle == CXXNewInitializationStyle::Call && - "paren init for non-call init"); + assert(initStyle == CXXNewExpr::CallInit && "paren init for non-call init"); Exprs = MultiExprArg(List->getExprs(), List->getNumExprs()); } // C++11 [expr.new]p15: // A new-expression that creates an object of type T initializes that // object as follows: - InitializationKind Kind = [&] { - switch (InitStyle) { - // - If the new-initializer is omitted, the object is default- - // initialized (8.5); if no initialization is performed, - // the object has indeterminate value - case CXXNewInitializationStyle::None: - case CXXNewInitializationStyle::Implicit: - return InitializationKind::CreateDefault(TypeRange.getBegin()); - // - Otherwise, the new-initializer is interpreted according to the - // initialization rules of 8.5 for direct-initialization. - case CXXNewInitializationStyle::Call: - return InitializationKind::CreateDirect(TypeRange.getBegin(), - DirectInitRange.getBegin(), - DirectInitRange.getEnd()); - case CXXNewInitializationStyle::List: - return InitializationKind::CreateDirectList(TypeRange.getBegin(), - Initializer->getBeginLoc(), - Initializer->getEndLoc()); - } - }(); + InitializationKind Kind + // - If the new-initializer is omitted, the object is default- + // initialized (8.5); if no initialization is performed, + // the object has indeterminate value + = initStyle == CXXNewExpr::NoInit + ? InitializationKind::CreateDefault(TypeRange.getBegin()) + // - Otherwise, the new-initializer is interpreted according to + // the + // initialization rules of 8.5 for direct-initialization. + : initStyle == CXXNewExpr::ListInit + ? InitializationKind::CreateDirectList( + TypeRange.getBegin(), Initializer->getBeginLoc(), + Initializer->getEndLoc()) + : InitializationKind::CreateDirect(TypeRange.getBegin(), + DirectInitRange.getBegin(), + DirectInitRange.getEnd()); // C++11 [dcl.spec.auto]p6. Deduce the type which 'auto' stands in for. auto *Deduced = AllocType->getContainedDeducedType(); @@ -2071,14 +2066,13 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, return ExprError(); } else if (Deduced && !Deduced->isDeduced()) { MultiExprArg Inits = Exprs; - bool Braced = (InitStyle == CXXNewInitializationStyle::List); + bool Braced = (initStyle == CXXNewExpr::ListInit); if (Braced) { auto *ILE = cast(Exprs[0]); Inits = MultiExprArg(ILE->getInits(), ILE->getNumInits()); } - if (InitStyle == CXXNewInitializationStyle::None || - InitStyle == CXXNewInitializationStyle::Implicit || Inits.empty()) + if (initStyle == CXXNewExpr::NoInit || Inits.empty()) return ExprError(Diag(StartLoc, diag::err_auto_new_requires_ctor_arg) << AllocType << TypeRange); if (Inits.size() > 1) { @@ -2402,7 +2396,7 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, // Array 'new' can't have any initializers except empty parentheses. // Initializer lists are also allowed, in C++11. Rely on the parser for the // dialect distinction. 
- if (ArraySize && !isLegalArrayNewInitializer(InitStyle, Initializer)) { + if (ArraySize && !isLegalArrayNewInitializer(initStyle, Initializer)) { SourceRange InitRange(Exprs.front()->getBeginLoc(), Exprs.back()->getEndLoc()); Diag(StartLoc, diag::err_new_array_init_args) << InitRange; @@ -2474,7 +2468,7 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, return CXXNewExpr::Create(Context, UseGlobal, OperatorNew, OperatorDelete, PassAlignment, UsualArrayDeleteWantsSize, - PlacementArgs, TypeIdParens, ArraySize, InitStyle, + PlacementArgs, TypeIdParens, ArraySize, initStyle, Initializer, ResultType, AllocTypeInfo, Range, DirectInitRange); } From bdac9720712aedae04ba96c53775d59e80aae69b Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Mon, 6 Nov 2023 14:59:26 +0100 Subject: [PATCH 23/40] Fix load64_aligned (#71391) Fix #64758. `load64_aligned` was missing a case for `alignment == 6`. --- libc/src/string/memory_utils/generic/aligned_access.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libc/src/string/memory_utils/generic/aligned_access.h b/libc/src/string/memory_utils/generic/aligned_access.h index 4238167e54d644..65bc63f6cbe557 100644 --- a/libc/src/string/memory_utils/generic/aligned_access.h +++ b/libc/src/string/memory_utils/generic/aligned_access.h @@ -28,7 +28,7 @@ namespace LIBC_NAMESPACE { return load32_aligned(ptr, offset); else if (alignment == 2) return load32_aligned(ptr, offset); - else + else // 1, 3 return load32_aligned(ptr, offset); } @@ -38,9 +38,11 @@ namespace LIBC_NAMESPACE { return load64_aligned(ptr, offset); else if (alignment == 4) return load64_aligned(ptr, offset); + else if (alignment == 6) + return load64_aligned(ptr, offset); else if (alignment == 2) return load64_aligned(ptr, offset); - else + else // 1, 3, 5, 7 return load64_aligned( ptr, offset); } From feedb7c0db1e06b9082f9c015362cdbf334696ff Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 6 Nov 2023 15:13:43 +0100 Subject: [PATCH 24/40] [clang][Interp] Fix IntAP(s) to IntAP(s) casts (#69915) This was still assert(false)'ed out; it handles casts between two IntAP/IntAPS expressions. We can't just short-circuit for FromT == ToT because we need to consider the bitwidth when doing the cast.
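In a nutshell, the FromT == ToT shortcut is wrong because the bit width is not part of the primitive type. The first three lines below are taken from the new intap.cpp test in this patch; the narrowing variant is an extra sketch that assumes Clang's _BitInt extension, where unsigned narrowing is defined modulo 2^N. _BitInt(10) and _BitInt(11) should both classify as the interpreter's IntAPS primitive type, so the primitive types compare equal even though the cast still has to re-extend the value to 11 bits:

    constexpr _BitInt(10) A = 1;
    constexpr _BitInt(11) B = A; // same primitive type, width 10 -> 11
    static_assert(B == 1, "");

    // Narrowing likewise depends on the target width, not the primitive type:
    constexpr unsigned _BitInt(10) Wide = 300;
    constexpr unsigned _BitInt(7) Narrow = Wide; // 300 mod 2^7 == 44
    static_assert(Narrow == 44, "");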
--- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 7 ++----- clang/test/AST/Interp/intap.cpp | 11 +++++++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 485893d58f487a..a2cf682b2532bd 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -200,16 +200,13 @@ bool ByteCodeExprGen::VisitCastExpr(const CastExpr *CE) { if (!this->visit(SubExpr)) return false; - if (FromT == ToT) { - assert(ToT != PT_IntAP && ToT != PT_IntAPS); - return true; - } - if (ToT == PT_IntAP) return this->emitCastAP(*FromT, Ctx.getBitWidth(CE->getType()), CE); if (ToT == PT_IntAPS) return this->emitCastAPS(*FromT, Ctx.getBitWidth(CE->getType()), CE); + if (FromT == ToT) + return true; return this->emitCast(*FromT, *ToT, CE); } diff --git a/clang/test/AST/Interp/intap.cpp b/clang/test/AST/Interp/intap.cpp index db9f516131af47..45961e6fc74b7a 100644 --- a/clang/test/AST/Interp/intap.cpp +++ b/clang/test/AST/Interp/intap.cpp @@ -30,6 +30,17 @@ static_assert(UBitIntZero1 == 0, ""); constexpr unsigned _BitInt(2) BI1 = 3u; static_assert(BI1 == 3, ""); +namespace APCast { + constexpr _BitInt(10) A = 1; + constexpr _BitInt(11) B = A; + static_assert(B == 1, ""); + constexpr _BitInt(16) B2 = A; + static_assert(B2 == 1, ""); + constexpr _BitInt(32) B3 = A; + static_assert(B3 == 1, ""); + constexpr unsigned _BitInt(32) B4 = A; + static_assert(B4 == 1, ""); +} #ifdef __SIZEOF_INT128__ namespace i128 { From fe6893b1d850795b386b0e5de6fdb3f6af31d87d Mon Sep 17 00:00:00 2001 From: Valery Pykhtin Date: Mon, 6 Nov 2023 15:16:49 +0100 Subject: [PATCH 25/40] Improve selection of conditional branch on amdgcn.ballot!=0 condition in SelectionDAG. (#68714) Improve selection of the following pattern: bool cnd = ... if (amdgcn.ballot(cnd) != 0) { ... } which means "execute _then_ if any lane has satisfied the _cnd_ condition". --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 72 ++- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 +- llvm/lib/Target/AMDGPU/SIISelLowering.h | 4 + .../GlobalISel/llvm.amdgcn.ballot.i32.ll | 389 ++++++++++++++ .../GlobalISel/llvm.amdgcn.ballot.i64.ll | 336 ++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 494 ++++++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 425 +++++++++++++++ 7 files changed, 1719 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 804ffb90b53024..cd810f0b43e50d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -19,6 +19,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/R600MCTargetDesc.h" #include "R600RegisterInfo.h" +#include "SIISelLowering.h" #include "SIMachineFunctionInfo.h" #include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/Analysis/ValueTracking.h" @@ -2263,6 +2264,34 @@ bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const { return false; } +static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) { + assert(VCMP->getOpcode() == AMDGPUISD::SETCC); + // Special case for amdgcn.ballot: + // %Cond = i1 (and/or combination of i1 ISD::SETCCs) + // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq + // => + // Use i1 %Cond value instead of i(WaveSize) %VCMP. + // This is possible because divergent ISD::SETCC is selected as V_CMP and + // Cond becomes a i(WaveSize) full mask value. 
+ // Note that ballot doesn't use SETEQ condition but its easy to support it + // here for completeness, so in this case Negate is set true on return. + auto VCMP_CC = cast(VCMP.getOperand(2))->get(); + auto *VCMP_CRHS = dyn_cast(VCMP.getOperand(1)); + if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) && VCMP_CRHS && + VCMP_CRHS->isZero()) { + + auto Cond = VCMP.getOperand(0); + if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension. + Cond = Cond.getOperand(0); + + if (isBoolSGPR(Cond)) { + Negate = VCMP_CC == ISD::SETEQ; + return Cond; + } + } + return SDValue(); +} + void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { SDValue Cond = N->getOperand(1); @@ -2276,11 +2305,50 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { const SIRegisterInfo *TRI = ST->getRegisterInfo(); bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N); - unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ; + bool AndExec = !UseSCCBr; + bool Negate = false; + + if (Cond.getOpcode() == ISD::SETCC && + Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) { + SDValue VCMP = Cond->getOperand(0); + auto CC = cast(Cond->getOperand(2))->get(); + auto *CRHS = dyn_cast(Cond->getOperand(1)); + if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero() && + // TODO: make condition below an assert after fixing ballot bitwidth. + VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) { + // %VCMP = i(WaveSize) AMDGPUISD::SETCC ... + // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq + // BRCOND i1 %C, %BB + // => + // %VCMP = i(WaveSize) AMDGPUISD::SETCC ... + // VCC = COPY i(WaveSize) %VCMP + // S_CBRANCH_VCCNZ/VCCZ %BB + Negate = CC == ISD::SETEQ; + bool NegatedBallot = false; + if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) { + Cond = BallotCond; + UseSCCBr = !BallotCond->isDivergent(); + Negate = Negate ^ NegatedBallot; + } else { + // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always + // selected as V_CMP, but this may change for uniform condition. + Cond = VCMP; + UseSCCBr = false; + } + } + // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of + // V_CMPs resulted from ballot or ballot has uniform condition and SCC is + // used. + AndExec = false; + } + + unsigned BrOp = + UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1) + : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ); Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC(); SDLoc SL(N); - if (!UseSCCBr) { + if (AndExec) { // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not // analyzed what generates the vcc value, so we do not know whether vcc // bits for disabled lanes are 0. Thus we need to mask out bits for diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f4f4d095fba3ce..5c46d81f57af6a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10628,9 +10628,7 @@ SDValue SITargetLowering::splitBinaryBitConstantOp( return SDValue(); } -// Returns true if argument is a boolean value which is not serialized into -// memory or argument and does not require v_cndmask_b32 to be deserialized. 
-static bool isBoolSGPR(SDValue V) { +bool llvm::isBoolSGPR(SDValue V) { if (V.getValueType() != MVT::i1) return false; switch (V.getOpcode()) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index d717e12d29a514..746a88c5ea13a3 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -586,6 +586,10 @@ class SITargetLowering final : public AMDGPUTargetLowering { getTargetMMOFlags(const Instruction &I) const override; }; +// Returns true if argument is a boolean value which is not serialized into +// memory or argument and does not require v_cndmask_b32 to be deserialized. +bool isBoolSGPR(SDValue V); + } // End namespace llvm #endif diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll index 8bd1be04650e00..d641063984eb81 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel -verify-machineinstrs < %s | FileCheck %s declare i32 @llvm.amdgcn.ballot.i32(i1) +declare i64 @llvm.amdgcn.ballot.i64(i1) declare i32 @llvm.ctpop.i32(i32) ; Test ballot(0) @@ -83,3 +84,391 @@ define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) { %bcnt = call i32 @llvm.ctpop.i32(i32 %ballot) ret i32 %bcnt } + +define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB7_3 +; CHECK-NEXT: .LBB7_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB7_3 +; CHECK-NEXT: .LBB7_3: + %c = trunc i32 %v to i1 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_ne_zero = icmp ne i32 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB8_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB8_3 +; CHECK-NEXT: .LBB8_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB8_3 +; CHECK-NEXT: .LBB8_3: + %c = trunc i32 %v to i1 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_ne_zero = icmp ne i32 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB9_3 +; CHECK-NEXT: .LBB9_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB9_3 +; CHECK-NEXT: .LBB9_3: + %c = trunc i32 
%v to i1 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_eq_zero = icmp eq i32 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB10_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_3: + %c = trunc i32 %v to i1 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_eq_zero = icmp eq i32 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB11_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB11_3 +; CHECK-NEXT: .LBB11_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB11_3 +; CHECK-NEXT: .LBB11_3: + %c = icmp ult i32 %v, 12 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_ne_zero = icmp ne i32 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot64_ne_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e64 s0, 12, v0 +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB12_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB12_3 +; CHECK-NEXT: .LBB12_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB12_3 +; CHECK-NEXT: .LBB12_3: + %c = icmp ult i32 %v, 12 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b32 s0, 1, 0 +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB13_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB13_3 +; CHECK-NEXT: .LBB13_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB13_3 +; CHECK-NEXT: .LBB13_3: + %c = icmp ult i32 %v, 12 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_ne_zero = icmp ne i32 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB14_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 
33 +; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_3: + %c = icmp ult i32 %v, 12 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_eq_zero = icmp eq i32 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b32 s0, 1, 0 +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB15_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB15_3 +; CHECK-NEXT: .LBB15_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB15_3 +; CHECK-NEXT: .LBB15_3: + %c = icmp ult i32 %v, 12 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_eq_zero = icmp eq i32 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) { +; CHECK-LABEL: branch_divergent_ballot_ne_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 +; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB16_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB16_3 +; CHECK-NEXT: .LBB16_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB16_3 +; CHECK-NEXT: .LBB16_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_ne_zero = icmp ne i32 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_and(i32 %v1, i32 %v2) { +; CHECK-LABEL: branch_divergent_ballot64_ne_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 +; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB17_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB17_3 +; CHECK-NEXT: .LBB17_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB17_3 +; CHECK-NEXT: .LBB17_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) { +; CHECK-LABEL: branch_uniform_ballot_ne_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b32 s0, 1, 0 +; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cselect_b32 s1, 1, 0 +; CHECK-NEXT: s_and_b32 s0, s0, s1 +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB18_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_2: ; 
%false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_ne_zero = icmp ne i32 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { +; CHECK-LABEL: branch_divergent_ballot_eq_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 +; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB19_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB19_3 +; CHECK-NEXT: .LBB19_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB19_3 +; CHECK-NEXT: .LBB19_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_eq_zero = icmp eq i32 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) { +; CHECK-LABEL: branch_uniform_ballot_eq_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b32 s0, 1, 0 +; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cselect_b32 s1, 1, 0 +; CHECK-NEXT: s_and_b32 s0, s0, s1 +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB20_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB20_3 +; CHECK-NEXT: .LBB20_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB20_3 +; CHECK-NEXT: .LBB20_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_eq_zero = icmp eq i32 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b32 s0, 1, 0 +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; CHECK-NEXT: s_cmp_le_i32 s0, 22 +; CHECK-NEXT: s_cbranch_scc1 .LBB21_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB21_3 +; CHECK-NEXT: .LBB21_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB21_3 +; CHECK-NEXT: .LBB21_3: + %c = icmp ult i32 %v, 12 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %bc = icmp sgt i32 %ballot, 22 + br i1 %bc, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll index 9f83012f545750..d879779962dfca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll @@ -86,3 +86,339 @@ define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) { %bcnt = call i64 @llvm.ctpop.i64(i64 %ballot) ret i64 %bcnt } + +define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) { +; 
CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_cmp_eq_u64 vcc, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB7_3 +; CHECK-NEXT: .LBB7_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB7_3 +; CHECK-NEXT: .LBB7_3: + %c = trunc i32 %v to i1 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB8_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB8_3 +; CHECK-NEXT: .LBB8_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB8_3 +; CHECK-NEXT: .LBB8_3: + %c = trunc i32 %v to i1 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB9_3 +; CHECK-NEXT: .LBB9_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB9_3 +; CHECK-NEXT: .LBB9_3: + %c = trunc i32 %v to i1 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_eq_zero = icmp eq i64 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB10_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_3: + %c = trunc i32 %v to i1 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_eq_zero = icmp eq i64 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 +; CHECK-NEXT: s_cmp_eq_u64 vcc, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB11_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB11_3 +; CHECK-NEXT: .LBB11_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB11_3 +; CHECK-NEXT: .LBB11_3: + %c = icmp ult i32 %v, 12 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 
%ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b32 s0, 1, 0 +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB12_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB12_3 +; CHECK-NEXT: .LBB12_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB12_3 +; CHECK-NEXT: .LBB12_3: + %c = icmp ult i32 %v, 12 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 +; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB13_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB13_3 +; CHECK-NEXT: .LBB13_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB13_3 +; CHECK-NEXT: .LBB13_3: + %c = icmp ult i32 %v, 12 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_eq_zero = icmp eq i64 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b32 s0, 1, 0 +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB14_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_3: + %c = icmp ult i32 %v, 12 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_eq_zero = icmp eq i64 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) { +; CHECK-LABEL: branch_divergent_ballot_ne_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 +; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 +; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB15_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB15_3 +; CHECK-NEXT: .LBB15_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB15_3 +; CHECK-NEXT: .LBB15_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) { +; CHECK-LABEL: branch_uniform_ballot_ne_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b32 s0, 1, 0 +; CHECK-NEXT: 
s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cselect_b32 s1, 1, 0 +; CHECK-NEXT: s_and_b32 s0, s0, s1 +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB16_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB16_3 +; CHECK-NEXT: .LBB16_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB16_3 +; CHECK-NEXT: .LBB16_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { +; CHECK-LABEL: branch_divergent_ballot_eq_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 +; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 +; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB17_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB17_3 +; CHECK-NEXT: .LBB17_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB17_3 +; CHECK-NEXT: .LBB17_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_eq_zero = icmp eq i64 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) { +; CHECK-LABEL: branch_uniform_ballot_eq_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b32 s0, 1, 0 +; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cselect_b32 s1, 1, 0 +; CHECK-NEXT: s_and_b32 s0, s0, s1 +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB18_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_eq_zero = icmp eq i64 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b32 s0, 1, 0 +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; CHECK-NEXT: v_cmp_le_i64_e64 vcc, s[0:1], 22 +; CHECK-NEXT: s_cbranch_vccnz .LBB19_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB19_3 +; CHECK-NEXT: .LBB19_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB19_3 +; CHECK-NEXT: .LBB19_3: + %c = icmp ult i32 %v, 12 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %bc = icmp sgt i64 %ballot, 22 + br i1 %bc, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll index 
7e719e86b521fd..3337d053eb930b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck %s declare i32 @llvm.amdgcn.ballot.i32(i1) +declare i64 @llvm.amdgcn.ballot.i64(i1) declare i32 @llvm.ctpop.i32(i32) ; Test ballot(0) @@ -83,3 +84,496 @@ define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) { %bcnt = call i32 @llvm.ctpop.i32(i32 %ballot) ret i32 %bcnt } + +define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; CHECK-NEXT: s_cbranch_vccz .LBB7_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB7_3 +; CHECK-NEXT: .LBB7_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB7_3 +; CHECK-NEXT: .LBB7_3: + %c = trunc i32 %v to i1 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_ne_zero = icmp ne i32 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_and_b32 s0, s0, 1 +; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0 +; CHECK-NEXT: s_cbranch_vccz .LBB8_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB8_3 +; CHECK-NEXT: .LBB8_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB8_3 +; CHECK-NEXT: .LBB8_3: + %c = trunc i32 %v to i1 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_ne_zero = icmp ne i32 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; CHECK-NEXT: s_cbranch_vccz .LBB9_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB9_3 +; CHECK-NEXT: .LBB9_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB9_3 +; CHECK-NEXT: .LBB9_3: + %c = trunc i32 %v to i1 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_eq_zero = icmp eq i32 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_and_b32 s0, s0, 1 +; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0 +; CHECK-NEXT: s_cbranch_vccz .LBB10_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_3: + %c = trunc i32 %v to i1 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_eq_zero = icmp eq i32 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) { +; CHECK-LABEL: 
branch_divergent_ballot_ne_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; CHECK-NEXT: s_cbranch_vccz .LBB11_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB11_3 +; CHECK-NEXT: .LBB11_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB11_3 +; CHECK-NEXT: .LBB11_3: + %c = icmp ult i32 %v, 12 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_ne_zero = icmp ne i32 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot64_ne_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e64 s0, 12, v0 +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB12_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB12_3 +; CHECK-NEXT: .LBB12_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB12_3 +; CHECK-NEXT: .LBB12_3: + %c = icmp ult i32 %v, 12 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12 +; CHECK-NEXT: s_cbranch_vccz .LBB13_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB13_3 +; CHECK-NEXT: .LBB13_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB13_3 +; CHECK-NEXT: .LBB13_3: + %c = icmp ult i32 %v, 12 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_ne_zero = icmp ne i32 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; CHECK-NEXT: s_cbranch_vccz .LBB14_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_3: + %c = icmp ult i32 %v, 12 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_eq_zero = icmp eq i32 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12 +; CHECK-NEXT: s_cbranch_vccz .LBB15_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB15_3 +; CHECK-NEXT: .LBB15_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB15_3 +; CHECK-NEXT: .LBB15_3: + %c = icmp ult i32 %v, 12 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_eq_zero = icmp eq i32 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) { +; CHECK-LABEL: branch_divergent_ballot_ne_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, 
v0 +; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 +; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 +; CHECK-NEXT: s_cbranch_vccz .LBB16_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB16_3 +; CHECK-NEXT: .LBB16_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB16_3 +; CHECK-NEXT: .LBB16_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_ne_zero = icmp ne i32 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_and(i32 %v1, i32 %v2) { +; CHECK-LABEL: branch_divergent_ballot64_ne_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB17_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB17_3 +; CHECK-NEXT: .LBB17_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB17_3 +; CHECK-NEXT: .LBB17_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) { +; CHECK-LABEL: branch_uniform_ballot_ne_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b32 s0, -1, 0 +; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cselect_b32 s1, -1, 0 +; CHECK-NEXT: s_and_b32 s0, s0, s1 +; CHECK-NEXT: s_and_b32 s0, s0, exec_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB18_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_ne_zero = icmp ne i32 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { +; CHECK-LABEL: branch_divergent_ballot_eq_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 +; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 +; CHECK-NEXT: s_cbranch_vccz .LBB19_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB19_3 +; CHECK-NEXT: .LBB19_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB19_3 +; CHECK-NEXT: .LBB19_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_eq_zero = icmp eq i32 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) { +; CHECK-LABEL: branch_uniform_ballot_eq_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; 
CHECK-NEXT: s_cselect_b32 s0, -1, 0 +; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cselect_b32 s1, -1, 0 +; CHECK-NEXT: s_and_b32 s0, s0, s1 +; CHECK-NEXT: s_and_b32 s0, s0, exec_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB20_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB20_3 +; CHECK-NEXT: .LBB20_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB20_3 +; CHECK-NEXT: .LBB20_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_eq_zero = icmp eq i32 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12 +; CHECK-NEXT: s_cmp_lt_i32 s0, 23 +; CHECK-NEXT: s_cbranch_scc1 .LBB21_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB21_3 +; CHECK-NEXT: .LBB21_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB21_3 +; CHECK-NEXT: .LBB21_3: + %c = icmp ult i32 %v, 12 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %bc = icmp sgt i32 %ballot, 22 + br i1 %bc, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +declare i32 @llvm.amdgcn.icmp.i32(i1, i1, i32) + +define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_ne_zero_and(i32 %v1, i32 %v2) { +; CHECK-LABEL: branch_divergent_simulated_negated_ballot_ne_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 +; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 +; CHECK-NEXT: s_cbranch_vccnz .LBB22_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB22_3 +; CHECK-NEXT: .LBB22_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB22_3 +; CHECK-NEXT: .LBB22_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32 + %ballot_ne_zero = icmp ne i32 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) { +; TODO: +; s_cmp_lt_u32 s0, 12 +; s_cselect_b32 s0, -1, 0 +; s_cmp_gt_u32 s1, 34 +; s_cselect_b32 s1, -1, 0 +; s_and_b32 s0, s0, s1 +; s_and_b32 s0, s0, exec_lo +; could be improved to: +; s_cmp_lt_u32 s0, 12 +; s_cselect_b32 s0, -1, 0 +; s_cmp_gt_u32 s1, 34 +; s_cselect_b32 s0, s0, 0 +; s_and_b32 s0, s0, exec_lo +; By selecting into vcc(_lo) instead, we could even avoid the AND-with-exec. 
+; CHECK-LABEL: branch_uniform_simulated_negated_ballot_ne_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b32 s0, -1, 0 +; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cselect_b32 s1, -1, 0 +; CHECK-NEXT: s_and_b32 s0, s0, s1 +; CHECK-NEXT: s_and_b32 s0, s0, exec_lo +; CHECK-NEXT: s_cbranch_scc1 .LBB23_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB23_3 +; CHECK-NEXT: .LBB23_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB23_3 +; CHECK-NEXT: .LBB23_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32 + %ballot_ne_zero = icmp ne i32 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_eq_zero_and(i32 %v1, i32 %v2) { +; CHECK-LABEL: branch_divergent_simulated_negated_ballot_eq_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 +; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 +; CHECK-NEXT: s_cbranch_vccnz .LBB24_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB24_3 +; CHECK-NEXT: .LBB24_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB24_3 +; CHECK-NEXT: .LBB24_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32 + %ballot_eq_zero = icmp eq i32 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) { +; CHECK-LABEL: branch_uniform_simulated_negated_ballot_eq_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b32 s0, -1, 0 +; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cselect_b32 s1, -1, 0 +; CHECK-NEXT: s_and_b32 s0, s0, s1 +; CHECK-NEXT: s_and_b32 s0, s0, exec_lo +; CHECK-NEXT: s_cbranch_scc1 .LBB25_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB25_3 +; CHECK-NEXT: .LBB25_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB25_3 +; CHECK-NEXT: .LBB25_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32 + %ballot_eq_zero = icmp eq i32 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll index 473159ec20e552..217f930a643da7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll @@ -86,3 +86,428 @@ define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) { %bcnt = call i64 @llvm.ctpop.i64(i64 %ballot) ret i64 %bcnt } + +define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_cbranch_vccz .LBB7_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB7_3 +; CHECK-NEXT: 
.LBB7_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB7_3 +; CHECK-NEXT: .LBB7_3: + %c = trunc i32 %v to i1 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_and_b32 s0, s0, 1 +; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0 +; CHECK-NEXT: s_cbranch_vccz .LBB8_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB8_3 +; CHECK-NEXT: .LBB8_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB8_3 +; CHECK-NEXT: .LBB8_3: + %c = trunc i32 %v to i1 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_cbranch_vccz .LBB9_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB9_3 +; CHECK-NEXT: .LBB9_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB9_3 +; CHECK-NEXT: .LBB9_3: + %c = trunc i32 %v to i1 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_eq_zero = icmp eq i64 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_and_b32 s0, s0, 1 +; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0 +; CHECK-NEXT: s_cbranch_vccz .LBB10_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_3: + %c = trunc i32 %v to i1 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_eq_zero = icmp eq i64 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 +; CHECK-NEXT: s_cbranch_vccz .LBB11_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB11_3 +; CHECK-NEXT: .LBB11_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB11_3 +; CHECK-NEXT: .LBB11_3: + %c = icmp ult i32 %v, 12 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12 +; CHECK-NEXT: s_cbranch_vccz .LBB12_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB12_3 +; CHECK-NEXT: .LBB12_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch 
.LBB12_3 +; CHECK-NEXT: .LBB12_3: + %c = icmp ult i32 %v, 12 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) { +; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 +; CHECK-NEXT: s_cbranch_vccz .LBB13_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB13_3 +; CHECK-NEXT: .LBB13_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB13_3 +; CHECK-NEXT: .LBB13_3: + %c = icmp ult i32 %v, 12 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_eq_zero = icmp eq i64 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12 +; CHECK-NEXT: s_cbranch_vccz .LBB14_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_3: + %c = icmp ult i32 %v, 12 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_eq_zero = icmp eq i64 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) { +; CHECK-LABEL: branch_divergent_ballot_ne_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 +; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 +; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1] +; CHECK-NEXT: s_cbranch_vccz .LBB15_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB15_3 +; CHECK-NEXT: .LBB15_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB15_3 +; CHECK-NEXT: .LBB15_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) { +; CHECK-LABEL: branch_uniform_ballot_ne_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_cbranch_scc0 .LBB16_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB16_3 +; CHECK-NEXT: .LBB16_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB16_3 +; CHECK-NEXT: .LBB16_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { +; CHECK-LABEL: branch_divergent_ballot_eq_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 
12, v0 +; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 +; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1] +; CHECK-NEXT: s_cbranch_vccz .LBB17_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB17_3 +; CHECK-NEXT: .LBB17_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB17_3 +; CHECK-NEXT: .LBB17_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_eq_zero = icmp eq i64 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) { +; CHECK-LABEL: branch_uniform_ballot_eq_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_cbranch_scc0 .LBB18_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_eq_zero = icmp eq i64 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) { +; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, 12 +; CHECK-NEXT: v_cmp_lt_i64_e64 vcc, s[0:1], 23 +; CHECK-NEXT: s_cbranch_vccnz .LBB19_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB19_3 +; CHECK-NEXT: .LBB19_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB19_3 +; CHECK-NEXT: .LBB19_3: + %c = icmp ult i32 %v, 12 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %bc = icmp sgt i64 %ballot, 22 + br i1 %bc, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +declare i64 @llvm.amdgcn.icmp.i64(i1, i1, i32) + +define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_ne_zero_and(i32 %v1, i32 %v2) { +; CHECK-LABEL: branch_divergent_simulated_negated_ballot_ne_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 +; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 +; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1] +; CHECK-NEXT: s_cbranch_vccnz .LBB20_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB20_3 +; CHECK-NEXT: .LBB20_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB20_3 +; CHECK-NEXT: .LBB20_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32 + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) { +; CHECK-LABEL: branch_uniform_simulated_negated_ballot_ne_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cselect_b64 
s[0:1], -1, 0 +; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_cbranch_scc1 .LBB21_2 +; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB21_3 +; CHECK-NEXT: .LBB21_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB21_3 +; CHECK-NEXT: .LBB21_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32 + %ballot_ne_zero = icmp ne i64 %ballot, 0 + br i1 %ballot_ne_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_eq_zero_and(i32 %v1, i32 %v2) { +; CHECK-LABEL: branch_divergent_simulated_negated_ballot_eq_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 +; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 +; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1] +; CHECK-NEXT: s_cbranch_vccnz .LBB22_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB22_3 +; CHECK-NEXT: .LBB22_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB22_3 +; CHECK-NEXT: .LBB22_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32 + %ballot_eq_zero = icmp eq i64 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} + +define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) { +; CHECK-LABEL: branch_uniform_simulated_negated_ballot_eq_zero_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_cbranch_scc1 .LBB23_2 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB23_3 +; CHECK-NEXT: .LBB23_2: ; %true +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_branch .LBB23_3 +; CHECK-NEXT: .LBB23_3: + %v1c = icmp ult i32 %v1, 12 + %v2c = icmp ugt i32 %v2, 34 + %c = and i1 %v1c, %v2c + %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32 + %ballot_eq_zero = icmp eq i64 %ballot, 0 + br i1 %ballot_eq_zero, label %true, label %false +true: + ret i32 42 +false: + ret i32 33 +} \ No newline at end of file From 25af06fd7a6b2a9def447a90dc10b73424a31600 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 6 Nov 2023 15:17:32 +0100 Subject: [PATCH 26/40] [InstCombine] Avoid use of FP cast constant expressions (NFC) Use the constant folding API instead. As we're working on plain ConstantFP, this should always succeed. --- llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 87181650e75872..22fd3edc39acb0 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1841,10 +1841,11 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) { // instcombined. 
    if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS))
      if (IsValidPromotion(FPType, LHSIntVal->getType())) {
-        Constant *CI =
-            ConstantExpr::getFPToSI(CFP, LHSIntVal->getType());
+        Constant *CI = ConstantFoldCastOperand(Instruction::FPToSI, CFP,
+                                               LHSIntVal->getType(), DL);
         if (LHSConv->hasOneUse() &&
-            ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
+            ConstantFoldCastOperand(Instruction::SIToFP, CI, I.getType(), DL) ==
+                CFP &&
             willNotOverflowSignedAdd(LHSIntVal, CI, I)) {
           // Insert the new integer add.
           Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, CI, "addconv");

From 16a595e398f3e4f1b7374208fc32c4059b4df84a Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Mon, 6 Nov 2023 15:27:01 +0100
Subject: [PATCH 27/40] [Attributor] Avoid use of ConstantExpr::getFPTrunc() (NFC)

Use the constant folding API instead. For simplicity I'm using the
DL-independent API here.

---
 llvm/lib/Transforms/IPO/Attributor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index 49ced893d5c734..50e4a8b87a5d2d 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -330,7 +330,7 @@ Value *AA::getWithType(Value &V, Type &Ty) {
       if (C->getType()->isIntegerTy() && Ty.isIntegerTy())
         return ConstantExpr::getTrunc(C, &Ty, /* OnlyIfReduced */ true);
       if (C->getType()->isFloatingPointTy() && Ty.isFloatingPointTy())
-        return ConstantExpr::getFPTrunc(C, &Ty, /* OnlyIfReduced */ true);
+        return ConstantFoldCastInstruction(Instruction::FPTrunc, C, &Ty);
     }
   }
   return nullptr;

From be3cef0b2a4274c48f16a7b6cb80572091a6948f Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Mon, 6 Nov 2023 15:38:42 +0100
Subject: [PATCH 28/40] [LibCallsShrinkWrap] Avoid use of ConstantExpr::getFPExtend() (NFC)

Use the constant folding API instead.
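As a rough standalone sketch of the same migration pattern (illustrative
only: the extendToMatch helper and the include set are assumptions of this
note, not code from this series), the ConstantExpr cast builder is swapped
for the constant-folding entry point, which for a plain ConstantFP always
yields a folded constant:

  // Sketch only: fold an FP extension of a constant up front instead of
  // building a ConstantExpr cast node.
  #include "llvm/IR/ConstantFold.h"  // declares ConstantFoldCastInstruction
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instruction.h"

  using namespace llvm;

  // Widen a float constant V to the (wider) FP type DestTy.
  static Constant *extendToMatch(Constant *V, Type *DestTy) {
    // Before: return ConstantExpr::getFPExtend(V, DestTy);
    // After: fold the cast eagerly; a plain ConstantFP always folds, so
    // the result is non-null here.
    return ConstantFoldCastInstruction(Instruction::FPExt, V, DestTy);
  }

The DL-independent entry point suffices because extending a scalar FP
literal does not consult the DataLayout; the DL-aware
ConstantFoldCastOperand used in the InstCombine patch above would work
equally well.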
---
 llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
index cdcfb5050bff36..6220f850930969 100644
--- a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
+++ b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -101,7 +101,7 @@ class LibCallsShrinkWrap : public InstVisitor<LibCallsShrinkWrap> {
                     float Val) {
     Constant *V = ConstantFP::get(BBBuilder.getContext(), APFloat(Val));
     if (!Arg->getType()->isFloatTy())
-      V = ConstantExpr::getFPExtend(V, Arg->getType());
+      V = ConstantFoldCastInstruction(Instruction::FPExt, V, Arg->getType());
     if (BBBuilder.GetInsertBlock()->getParent()->hasFnAttribute(Attribute::StrictFP))
       BBBuilder.setIsFPConstrained(true);
     return BBBuilder.CreateFCmp(Cmp, Arg, V);

From f9404a1b57a5e6050138ceeea327cfd2a4c84f06 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Mon, 6 Nov 2023 15:42:02 +0100
Subject: [PATCH 29/40] [AMDGPU] Regenerate test to fix failure

---
 llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll
index e22cb912552f97..8cb9e9f992f546 100644
--- a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll
@@ -69,8 +69,7 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
   ; REGALLOC-NEXT:   $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3
   ; REGALLOC-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; REGALLOC-NEXT:   $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
-  ; REGALLOC-NEXT:   renamable $sgpr4 = S_MOV_B32 5
-  ; REGALLOC-NEXT:   renamable $vgpr0 = V_MUL_LO_U32_e64 killed $vgpr0, killed $sgpr4, implicit $exec
+  ; REGALLOC-NEXT:   renamable $vgpr0 = V_LSHL_ADD_U32_e64 killed $vgpr0, 2, $vgpr0, implicit $exec
   ; REGALLOC-NEXT:   KILL killed renamable $vgpr1
   ; REGALLOC-NEXT:   SI_RETURN implicit killed $vgpr0
   bb.0:

From 4983432f17eb4b445e161c5f8278c6ea4d5d1241 Mon Sep 17 00:00:00 2001
From: Christian Ulmann
Date: Mon, 6 Nov 2023 15:48:03 +0100
Subject: [PATCH 30/40] [MLIR][LLVM] Remove typed pointers from the LLVM dialect (#71285)

This commit removes support for typed pointers from the LLVM dialect.
Typed pointers have been deprecated for a while and thus this removal
was announced in a PSA:
https://discourse.llvm.org/t/psa-removal-of-typed-pointers-from-the-llvm-dialect/74502

This change includes:
- Changing the `LLVMPointerType`
- Removing remaining usages of the builders and the now removed element
  type
- Fixing assembly formats that require fully qualified pointer types
- Updating ODS pointer constraints

---
 .../include/mlir/Dialect/LLVMIR/LLVMDialect.h |   3 +-
 .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td   |  28 +-
 .../include/mlir/Dialect/LLVMIR/LLVMOpBase.td |  37 +--
 mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td   |  60 +----
 mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td |  21 +-
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td   |  54 ++--
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td  |   2 +-
 .../Conversion/LLVMCommon/TypeConverter.cpp   |  10 -
 mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp    | 244 ++----------------
 mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp |  18 +-
 mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp      |  84 +-----
 .../LLVMIR/Transforms/TypeConsistency.cpp     |  17 +-
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp       |  41 ++-
 mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp  |  27 +-
 mlir/test/Dialect/LLVMIR/global.mlir          |   3 +-
 mlir/test/Dialect/LLVMIR/invalid.mlir         |  61 +----
 .../unittests/Dialect/LLVMIR/LLVMTypeTest.cpp |  34 ---
 17 files changed, 138 insertions(+), 606 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h
index bbed1ea5cf6220..06df4a601b7a3f 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h
@@ -209,8 +209,7 @@ class GEPIndicesAdaptor {
   /// global and use it to compute the address of the first character in the
   /// string (operations inserted at the builder insertion point).
   Value createGlobalString(Location loc, OpBuilder &builder, StringRef name,
-                           StringRef value, Linkage linkage,
-                           bool useOpaquePointers = true);
+                           StringRef value, Linkage linkage);
 
   /// LLVM requires some operations to be inside of a Module operation. This
   /// function confirms that the Operation has the desired properties.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index 72c932ac07a2e1..1123466b7a75e3 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -469,16 +469,16 @@ def LLVM_ThreadlocalAddressOp : LLVM_OneResultIntrOp<"threadlocal.address", [], def LLVM_CoroIdOp : LLVM_IntrOp<"coro.id", [], [], [], 1> { let arguments = (ins I32:$align, - LLVM_i8Ptr:$promise, - LLVM_i8Ptr:$coroaddr, - LLVM_i8Ptr:$fnaddrs); + LLVM_AnyPointer:$promise, + LLVM_AnyPointer:$coroaddr, + LLVM_AnyPointer:$fnaddrs); let assemblyFormat = "$align `,` $promise `,` $coroaddr `,` $fnaddrs" " attr-dict `:` functional-type(operands, results)"; } def LLVM_CoroBeginOp : LLVM_IntrOp<"coro.begin", [], [], [], 1> { let arguments = (ins LLVM_TokenType:$token, - LLVM_i8Ptr:$mem); + LLVM_AnyPointer:$mem); let assemblyFormat = "$token `,` $mem attr-dict `:` functional-type(operands, results)"; } @@ -491,7 +491,7 @@ def LLVM_CoroAlignOp : LLVM_IntrOp<"coro.align", [0], [], [], 1> { } def LLVM_CoroSaveOp : LLVM_IntrOp<"coro.save", [], [], [], 1> { - let arguments = (ins LLVM_i8Ptr:$handle); + let arguments = (ins LLVM_AnyPointer:$handle); let assemblyFormat = "$handle attr-dict `:` functional-type(operands, results)"; } @@ -502,7 +502,7 @@ def LLVM_CoroSuspendOp : LLVM_IntrOp<"coro.suspend", [], [], [], 1> { } def LLVM_CoroEndOp : LLVM_IntrOp<"coro.end", [], [], [], 1> { - let arguments = (ins LLVM_i8Ptr:$handle, + let arguments = (ins LLVM_AnyPointer:$handle, I1:$unwind, LLVM_TokenType:$retvals); let assemblyFormat = "$handle `,` $unwind `,` $retvals attr-dict `:` functional-type(operands, results)"; @@ -510,12 +510,12 @@ def LLVM_CoroEndOp : LLVM_IntrOp<"coro.end", [], [], [], 1> { def LLVM_CoroFreeOp : LLVM_IntrOp<"coro.free", [], [], [], 1> { let arguments = (ins LLVM_TokenType:$id, - LLVM_i8Ptr:$handle); + LLVM_AnyPointer:$handle); let assemblyFormat = "$id `,` $handle attr-dict `:` functional-type(operands, results)"; } def LLVM_CoroResumeOp : LLVM_IntrOp<"coro.resume", [], [], [], 0> { - let arguments = (ins LLVM_i8Ptr:$handle); + let arguments = (ins LLVM_AnyPointer:$handle); let assemblyFormat = "$handle attr-dict `:` qualified(type($handle))"; } @@ -591,19 +591,19 @@ def LLVM_DbgLabelOp : LLVM_IntrOp<"dbg.label", [], [], [], 0> { // def LLVM_VaStartOp : LLVM_ZeroResultIntrOp<"vastart">, - Arguments<(ins LLVM_i8Ptr:$arg_list)> { + Arguments<(ins LLVM_AnyPointer:$arg_list)> { let assemblyFormat = "$arg_list attr-dict `:` qualified(type($arg_list))"; let summary = "Initializes `arg_list` for subsequent variadic argument extractions."; } def LLVM_VaCopyOp : LLVM_ZeroResultIntrOp<"vacopy">, - Arguments<(ins LLVM_i8Ptr:$dest_list, LLVM_i8Ptr:$src_list)> { + Arguments<(ins LLVM_AnyPointer:$dest_list, LLVM_AnyPointer:$src_list)> { let assemblyFormat = "$src_list `to` $dest_list attr-dict `:` type(operands)"; let summary = "Copies the current argument position from `src_list` to `dest_list`."; } def LLVM_VaEndOp : LLVM_ZeroResultIntrOp<"vaend">, - Arguments<(ins LLVM_i8Ptr:$arg_list)> { + Arguments<(ins LLVM_AnyPointer:$arg_list)> { let assemblyFormat = "$arg_list attr-dict `:` qualified(type($arg_list))"; let summary = "Destroys `arg_list`, which has been initialized by `intr.vastart` or `intr.vacopy`."; } @@ -613,7 +613,7 @@ def LLVM_VaEndOp : LLVM_ZeroResultIntrOp<"vaend">, // def LLVM_EhTypeidForOp : LLVM_OneResultIntrOp<"eh.typeid.for"> { - let arguments = (ins LLVM_i8Ptr:$type_info); + let 
arguments = (ins LLVM_AnyPointer:$type_info); let assemblyFormat = "$type_info attr-dict `:` functional-type(operands, results)"; } @@ -927,12 +927,12 @@ def LLVM_PtrAnnotation : LLVM_OneResultIntrOp<"ptr.annotation", [0], [2], [AllTypesMatch<["res", "ptr"]>, AllTypesMatch<["annotation", "fileName", "attr"]>]> { - let arguments = (ins LLVM_PointerTo:$ptr, + let arguments = (ins LLVM_AnyPointer:$ptr, LLVM_AnyPointer:$annotation, LLVM_AnyPointer:$fileName, I32:$line, LLVM_AnyPointer:$attr); - let results = (outs LLVM_PointerTo:$res); + let results = (outs LLVM_AnyPointer:$res); } def LLVM_Annotation diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td index 503988abfc090a..4e42a0e46d9bf9 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td @@ -55,43 +55,18 @@ def LLVM_AnyFloat : Type< def LLVM_AnyPointer : Type($_self)">, "LLVM pointer type", "::mlir::LLVM::LLVMPointerType">; -def LLVM_OpaquePointer : Type< +// Pointer in a given address space. +class LLVM_PointerInAddressSpace : Type< And<[LLVM_AnyPointer.predicate, - CPred<"::llvm::cast<::mlir::LLVM::LLVMPointerType>($_self).isOpaque()">]>, - "LLVM opaque pointer", "::mlir::LLVM::LLVMPointerType">; - -// Type constraint accepting LLVM pointer type with an additional constraint -// on the element type. -class LLVM_PointerTo : Type< - And<[LLVM_AnyPointer.predicate, - Or<[LLVM_OpaquePointer.predicate, - SubstLeaves< - "$_self", - "::llvm::cast<::mlir::LLVM::LLVMPointerType>($_self).getElementType()", - pointee.predicate>]>]>, - "LLVM pointer to " # pointee.summary, "::mlir::LLVM::LLVMPointerType">; - -// Opaque pointer in a given address space. -class LLVM_OpaquePointerInAddressSpace : Type< - And<[LLVM_OpaquePointer.predicate, - CPred< - "::llvm::cast<::mlir::LLVM::LLVMPointerType>($_self).getAddressSpace() == " - # addressSpace>]>, - "Opaque LLVM pointer in address space " # addressSpace, + CPred< + "::llvm::cast<::mlir::LLVM::LLVMPointerType>($_self).getAddressSpace() == " + # addressSpace>]>, + "LLVM pointer in address space " # addressSpace, "::mlir::LLVM::LLVMPointerType"> { let builderCall = "$_builder.getType<::mlir::LLVM::LLVMPointerType>(" # addressSpace # ")"; } -// Type constraints accepting LLVM pointer type to integer of a specific width. -class LLVM_IntPtrBase : Type< - And<[LLVM_PointerTo>.predicate, - CPred<"::llvm::cast<::mlir::LLVM::LLVMPointerType>($_self).getAddressSpace()" - " == " # addressSpace>]>, - "LLVM pointer to " # I.summary>; - -def LLVM_i8Ptr : LLVM_IntPtrBase<8>; - // Type constraint accepting any LLVM structure type. def LLVM_AnyStruct : Type($_self)">, "LLVM structure type">; diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index c8549f146d0297..cb8a6e08886df9 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -184,16 +184,14 @@ def LLVM_AllocaOp : LLVM_Op<"alloca", LLVM_MemOpPatterns { let arguments = (ins AnyInteger:$arraySize, OptionalAttr:$alignment, - OptionalAttr:$elem_type, + TypeAttr:$elem_type, UnitAttr:$inalloca); let results = (outs Res]>:$res); string llvmInstName = "Alloca"; string llvmBuilder = [{ auto addrSpace = $_resultType->getPointerAddressSpace(); - llvm::Type *elementType = moduleTranslation.convertType( - $elem_type ? 
*$elem_type - : ::llvm::cast(op.getType()).getElementType()); + llvm::Type *elementType = moduleTranslation.convertType($elem_type); auto *inst = builder.CreateAlloca(elementType, addrSpace, $arraySize); }] # setAlignmentCode # [{ inst->setUsedWithInAlloca($inalloca); @@ -207,31 +205,16 @@ def LLVM_AllocaOp : LLVM_Op<"alloca", $res = $_builder.create( $_location, $_resultType, $arraySize, alignment == 0 ? IntegerAttr() : $_builder.getI64IntegerAttr(alignment), - TypeAttr::get(allocatedType), allocaInst->isUsedWithInAlloca()); + allocatedType, allocaInst->isUsedWithInAlloca()); }]; let builders = [ - DeprecatedOpBuilder<"the usage of typed pointers is deprecated", - (ins "Type":$resultType, "Value":$arraySize, - "unsigned":$alignment), - [{ - assert(!::llvm::cast(resultType).isOpaque() && - "pass the allocated type explicitly if opaque pointers are used"); - if (alignment == 0) - return build($_builder, $_state, resultType, arraySize, IntegerAttr(), - TypeAttr(), false); - build($_builder, $_state, resultType, arraySize, - $_builder.getI64IntegerAttr(alignment), TypeAttr(), false); - }]>, OpBuilder<(ins "Type":$resultType, "Type":$elementType, "Value":$arraySize, CArg<"unsigned", "0">:$alignment), [{ - TypeAttr elemTypeAttr = - ::llvm::cast(resultType).isOpaque() ? - TypeAttr::get(elementType) : TypeAttr(); build($_builder, $_state, resultType, arraySize, alignment == 0 ? IntegerAttr() : $_builder.getI64IntegerAttr(alignment), - elemTypeAttr, false); + elementType, false); }]> ]; @@ -247,7 +230,7 @@ def LLVM_GEPOp : LLVM_Op<"getelementptr", [Pure, let arguments = (ins LLVM_ScalarOrVectorOf:$base, Variadic>:$dynamicIndices, DenseI32ArrayAttr:$rawConstantIndices, - OptionalAttr:$elem_type, + TypeAttr:$elem_type, UnitAttr:$inbounds); let results = (outs LLVM_ScalarOrVectorOf:$res); let skipDefaultBuilders = 1; @@ -282,14 +265,6 @@ def LLVM_GEPOp : LLVM_Op<"getelementptr", [Pure, OpBuilder<(ins "Type":$resultType, "Type":$basePtrType, "Value":$basePtr, "ValueRange":$indices, CArg<"bool", "false">:$inbounds, CArg<"ArrayRef", "{}">:$attributes)>, - DeprecatedOpBuilder<"the usage of typed pointers is deprecated", - (ins "Type":$resultType, "Value":$basePtr, - "ValueRange":$indices, CArg<"bool", "false">:$inbounds, - CArg<"ArrayRef", "{}">:$attributes)>, - DeprecatedOpBuilder<"the usage of typed pointers is deprecated", - (ins "Type":$resultType, "Value":$basePtr, - "ArrayRef":$indices, CArg<"bool", "false">:$inbounds, - CArg<"ArrayRef", "{}">:$attributes)>, OpBuilder<(ins "Type":$resultType, "Type":$basePtrType, "Value":$basePtr, "ArrayRef":$indices, CArg<"bool", "false">:$inbounds, CArg<"ArrayRef", "{}">:$attributes)>, @@ -306,22 +281,19 @@ def LLVM_GEPOp : LLVM_Op<"getelementptr", [Pure, indices.push_back( builder.getInt32(valueOrAttr.get().getInt())); } - Type baseElementType = op.getSourceElementType(); + Type baseElementType = op.getElemType(); llvm::Type *elementType = moduleTranslation.convertType(baseElementType); $res = builder.CreateGEP(elementType, $base, indices, "", $inbounds); }]; let assemblyFormat = [{ (`inbounds` $inbounds^)? $base `[` custom($dynamicIndices, $rawConstantIndices) `]` attr-dict - `:` functional-type(operands, results) (`,` $elem_type^)? + `:` functional-type(operands, results) `,` $elem_type }]; let extraClassDeclaration = [{ constexpr static int32_t kDynamicIndex = std::numeric_limits::min(); - /// Returns the type pointed to by the pointer argument of this GEP. 
- Type getSourceElementType(); - GEPIndicesAdaptor getIndices(); }]; let hasFolder = 1; @@ -332,7 +304,7 @@ def LLVM_LoadOp : LLVM_MemAccessOpBase<"load", [DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { - dag args = (ins LLVM_PointerTo:$addr, + dag args = (ins LLVM_AnyPointer:$addr, OptionalAttr:$alignment, UnitAttr:$volatile_, UnitAttr:$nontemporal, @@ -370,7 +342,7 @@ def LLVM_LoadOp : LLVM_MemAccessOpBase<"load", let assemblyFormat = [{ (`volatile` $volatile_^)? $addr (`atomic` (`syncscope` `(` $syncscope^ `)`)? $ordering^)? - attr-dict `:` custom(type($addr), type($res)) + attr-dict `:` qualified(type($addr)) `->` type($res) }]; string llvmBuilder = [{ auto *inst = builder.CreateLoad($_resultType, $addr, $volatile_); @@ -391,9 +363,6 @@ def LLVM_LoadOp : LLVM_MemAccessOpBase<"load", getLLVMSyncScope(loadInst)); }]; let builders = [ - DeprecatedOpBuilder<"the usage of typed pointers is deprecated", - (ins "Value":$addr, CArg<"unsigned", "0">:$alignment, - CArg<"bool", "false">:$isVolatile, CArg<"bool", "false">:$isNonTemporal)>, OpBuilder<(ins "Type":$type, "Value":$addr, CArg<"unsigned", "0">:$alignment, CArg<"bool", "false">:$isVolatile, CArg<"bool", "false">:$isNonTemporal, @@ -408,7 +377,7 @@ def LLVM_StoreOp : LLVM_MemAccessOpBase<"store", DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { dag args = (ins LLVM_LoadableType:$value, - LLVM_PointerTo:$addr, + LLVM_AnyPointer:$addr, OptionalAttr:$alignment, UnitAttr:$volatile_, UnitAttr:$nontemporal, @@ -445,7 +414,7 @@ def LLVM_StoreOp : LLVM_MemAccessOpBase<"store", let assemblyFormat = [{ (`volatile` $volatile_^)? $value `,` $addr (`atomic` (`syncscope` `(` $syncscope^ `)`)? $ordering^)? - attr-dict `:` custom(type($value), type($addr)) + attr-dict `:` type($value) `,` qualified(type($addr)) }]; string llvmBuilder = [{ auto *inst = builder.CreateStore($value, $addr, $volatile_); @@ -651,8 +620,7 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call", OpBuilder<(ins "LLVMFunctionType":$calleeType, "FlatSymbolRefAttr":$callee, CArg<"ValueRange", "{}">:$args)>, OpBuilder<(ins "LLVMFunctionType":$calleeType, "StringRef":$callee, - CArg<"ValueRange", "{}">:$args)>, - OpBuilder<(ins "Value":$callee, "ValueRange":$args)> + CArg<"ValueRange", "{}">:$args)> ]; let hasCustomAssemblyFormat = 1; let extraClassDeclaration = [{ @@ -1636,7 +1604,7 @@ def LLVM_AtomicRMWOp : LLVM_MemAccessOpBase<"atomicrmw", [ TypesMatchWith<"result #0 and operand #1 have the same type", "val", "res", "$_self">]> { dag args = (ins AtomicBinOp:$bin_op, - LLVM_PointerTo:$ptr, + LLVM_AnyPointer:$ptr, LLVM_AtomicRMWType:$val, AtomicOrdering:$ordering, OptionalAttr:$syncscope, OptionalAttr:$alignment, @@ -1687,7 +1655,7 @@ def LLVM_AtomicCmpXchgOp : LLVM_MemAccessOpBase<"cmpxchg", [ TypesMatchWith<"result #0 has an LLVM struct type consisting of " "the type of operand #2 and a bool", "val", "res", "getValAndBoolStructType($_self)">]> { - dag args = (ins LLVM_PointerTo:$ptr, + dag args = (ins LLVM_AnyPointer:$ptr, LLVM_AtomicCmpXchgType:$cmp, LLVM_AtomicCmpXchgType:$val, AtomicOrdering:$success_ordering, AtomicOrdering:$failure_ordering, diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td index e31029bfed5a54..0bd068c1be7c90 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td @@ -137,30 +137,17 @@ def LLVMPointerType : LLVMType<"LLVMPointer", "ptr", [ ``` }]; - let parameters = (ins DefaultValuedParameter<"Type", 
"Type()">:$elementType, - DefaultValuedParameter<"unsigned", "0">:$addressSpace); + let parameters = (ins DefaultValuedParameter<"unsigned", "0">:$addressSpace); let assemblyFormat = [{ - (`<` custom($elementType, $addressSpace)^ `>`)? + (`<` $addressSpace^ `>`)? }]; - let genVerifyDecl = 1; - + let skipDefaultBuilders = 1; let builders = [ - TypeBuilderWithInferredContext<(ins "Type":$elementType, - CArg<"unsigned", "0">:$addressSpace)>, TypeBuilder<(ins CArg<"unsigned", "0">:$addressSpace), [{ - return $_get($_ctxt, Type(), addressSpace); + return $_get($_ctxt, addressSpace); }]> ]; - - let extraClassDeclaration = [{ - /// Returns `true` if this type is the opaque pointer type, i.e., it has no - /// pointed-to type. - bool isOpaque() const { return !getElementType(); } - - /// Checks if the given type can have a pointer type pointing to it. - static bool isValidElementType(Type type); - }]; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index c49decde1638b1..ffe6f25fcd944b 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -19,10 +19,8 @@ include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td" -def LLVM_i8Ptr_global : LLVM_IntPtrBase<8, 1>; -def LLVM_i8Ptr_shared : LLVM_IntPtrBase<8, 3>; -def LLVM_i64ptr_any : LLVM_IntPtrBase<64>; -def LLVM_i64ptr_shared : LLVM_IntPtrBase<64, 3>; +def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>; +def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>; //===----------------------------------------------------------------------===// // NVVM dialect definitions @@ -213,7 +211,7 @@ def NVVM_ReduxOp : /// mbarrier.init instruction with generic pointer type def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">, - Arguments<(ins LLVM_i64ptr_any:$addr, I32:$count, PtxPredicate:$predicate)> { + Arguments<(ins LLVM_AnyPointer:$addr, I32:$count, PtxPredicate:$predicate)> { string llvmBuilder = [{ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_init, {$addr, $count}); }]; @@ -228,7 +226,7 @@ def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">, /// mbarrier.init instruction with shared pointer type def NVVM_MBarrierInitSharedOp : NVVM_PTXBuilder_Op<"mbarrier.init.shared">, - Arguments<(ins LLVM_i64ptr_shared:$addr, I32:$count, PtxPredicate:$predicate)> { + Arguments<(ins LLVM_PointerShared:$addr, I32:$count, PtxPredicate:$predicate)> { string llvmBuilder = [{ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_init_shared, {$addr, $count}); }]; @@ -240,7 +238,7 @@ def NVVM_MBarrierInitSharedOp : NVVM_PTXBuilder_Op<"mbarrier.init.shared">, } def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">, - Arguments<(ins LLVM_i64ptr_any:$addr)> { + Arguments<(ins LLVM_AnyPointer:$addr)> { string llvmBuilder = [{ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_inval, {$addr}); }]; @@ -248,7 +246,7 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">, } def NVVM_MBarrierInvalSharedOp : NVVM_Op<"mbarrier.inval.shared">, - Arguments<(ins LLVM_i64ptr_shared:$addr)> { + Arguments<(ins LLVM_PointerShared:$addr)> { string llvmBuilder = [{ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_inval_shared, {$addr}); }]; @@ -257,7 +255,7 @@ def NVVM_MBarrierInvalSharedOp : NVVM_Op<"mbarrier.inval.shared">, def NVVM_MBarrierArriveOp : 
NVVM_Op<"mbarrier.arrive">, Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_i64ptr_any:$addr)> { + Arguments<(ins LLVM_AnyPointer:$addr)> { string llvmBuilder = [{ $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive, {$addr}); }]; @@ -266,16 +264,16 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">, def NVVM_MBarrierArriveSharedOp : NVVM_Op<"mbarrier.arrive.shared">, Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_i64ptr_shared:$addr)> { + Arguments<(ins LLVM_PointerShared:$addr)> { string llvmBuilder = [{ $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_shared, {$addr}); }]; - let assemblyFormat = "$addr attr-dict `:` type($addr) `->` type($res)"; + let assemblyFormat = "$addr attr-dict `:` qualified(type($addr)) `->` type($res)"; } def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">, Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_i64ptr_any:$addr, I32:$count)> { + Arguments<(ins LLVM_AnyPointer:$addr, I32:$count)> { string llvmBuilder = [{ $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete, {$addr, $count}); }]; @@ -284,7 +282,7 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">, def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete.shared">, Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_i64ptr_shared:$addr, I32:$count)> { + Arguments<(ins LLVM_PointerShared:$addr, I32:$count)> { string llvmBuilder = [{ $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete_shared, {$addr, $count}); }]; @@ -292,7 +290,7 @@ def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete. } def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">, - Arguments<(ins LLVM_i64ptr_any:$addr, I32:$txcount, PtxPredicate:$predicate)> { + Arguments<(ins LLVM_AnyPointer:$addr, I32:$txcount, PtxPredicate:$predicate)> { let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)"; let extraClassDefinition = [{ std::string $cppClass::getPtx() { return std::string("mbarrier.arrive.expect_tx.b64 _, [%0], %1;"); } @@ -300,7 +298,7 @@ def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_t } def NVVM_MBarrierArriveExpectTxSharedOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx.shared">, - Arguments<(ins LLVM_i64ptr_shared:$addr, I32:$txcount, PtxPredicate:$predicate)> { + Arguments<(ins LLVM_PointerShared:$addr, I32:$txcount, PtxPredicate:$predicate)> { let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? 
attr-dict `:` type(operands)"; let extraClassDefinition = [{ std::string $cppClass::getPtx() { return std::string("mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;"); } @@ -308,7 +306,7 @@ def NVVM_MBarrierArriveExpectTxSharedOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.ex } def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity">, - Arguments<(ins LLVM_i64ptr_any:$addr, I32:$phase, I32:$ticks)> { + Arguments<(ins LLVM_AnyPointer:$addr, I32:$phase, I32:$ticks)> { let assemblyFormat = "$addr `,` $phase `,` $ticks attr-dict `:` type(operands)"; let extraClassDefinition = [{ std::string $cppClass::getPtx() { @@ -327,7 +325,7 @@ def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity" } def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity.shared">, - Arguments<(ins LLVM_i64ptr_shared:$addr, I32:$phase, I32:$ticks)> { + Arguments<(ins LLVM_PointerShared:$addr, I32:$phase, I32:$ticks)> { let assemblyFormat = "$addr `,` $phase `,` $ticks attr-dict `:` type(operands)"; let extraClassDefinition = [{ std::string $cppClass::getPtx() { @@ -347,7 +345,7 @@ def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.p def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">, Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_i64ptr_any:$addr, LLVM_Type:$state)> { + Arguments<(ins LLVM_AnyPointer:$addr, LLVM_Type:$state)> { string llvmBuilder = [{ $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait, {$addr, $state}); }]; @@ -356,7 +354,7 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">, def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">, Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_i64ptr_shared:$addr, LLVM_Type:$state)> { + Arguments<(ins LLVM_PointerShared:$addr, LLVM_Type:$state)> { string llvmBuilder = [{ $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait_shared, {$addr, $state}); }]; @@ -501,8 +499,8 @@ def LoadCacheModifierKind : I32EnumAttr<"LoadCacheModifierKind", def LoadCacheModifierAttr : EnumAttr; def NVVM_CpAsyncOp : NVVM_PTXBuilder_Op<"cp.async.shared.global">, - Arguments<(ins LLVM_i8Ptr_shared:$dst, - LLVM_i8Ptr_global:$src, + Arguments<(ins LLVM_PointerShared:$dst, + LLVM_PointerGlobal:$src, I32Attr:$size, LoadCacheModifierAttr:$modifier, Optional:$cpSize)> { @@ -1187,7 +1185,7 @@ def NVVM_WMMAMmaOp : NVVM_Op<"wmma.mma">, } def NVVM_StMatrixOp: NVVM_PTXBuilder_Op<"stmatrix">, - Arguments<(ins LLVM_i8Ptr_shared:$ptr, + Arguments<(ins LLVM_PointerShared:$ptr, Variadic:$sources, MMALayoutAttr:$layout)> { let summary = "cooperative matrix store"; @@ -1404,9 +1402,9 @@ def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp : NVVM_Op<"cp.async.bulk.tensor.shared.cluster.global", [DeclareOpInterfaceMethods, AttrSizedOperandSegments]>, - Arguments<(ins LLVM_i64ptr_shared:$dstMem, - LLVM_i64ptr_any:$tmaDescriptor, - LLVM_i64ptr_shared:$mbar, + Arguments<(ins LLVM_PointerShared:$dstMem, + LLVM_AnyPointer:$tmaDescriptor, + LLVM_PointerShared:$mbar, Variadic:$coordinates, PtxPredicate:$predicate)> { let assemblyFormat = [{ @@ -1439,8 +1437,8 @@ def NVVM_CpAsyncBulkTensorSharedCTAToGlobalOp : NVVM_Op<"cp.async.bulk.tensor.global.shared.cta", [DeclareOpInterfaceMethods, AttrSizedOperandSegments]>, - Arguments<(ins LLVM_i64ptr_any:$tmaDescriptor, - LLVM_i64ptr_shared:$srcMem, + Arguments<(ins LLVM_AnyPointer:$tmaDescriptor, + LLVM_PointerShared:$srcMem, Variadic:$coordinates, PtxPredicate:$predicate)> { let 
assemblyFormat = [{ @@ -1469,7 +1467,7 @@ def NVVM_CpAsyncBulkTensorSharedCTAToGlobalOp : def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap", [DeclareOpInterfaceMethods]>, - Arguments<(ins LLVM_i64ptr_any:$tmaDescriptor, PtxPredicate:$predicate)> { + Arguments<(ins LLVM_AnyPointer:$tmaDescriptor, PtxPredicate:$predicate)> { let assemblyFormat = "$tmaDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)"; let extraClassDefinition = [{ std::string $cppClass::getPtx() { diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 6c6419bf238b45..48b830ae34f292 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -275,7 +275,7 @@ def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4">; // raw buffer mode). //===---------------------------------------------------------------------===// -def ROCDLBufferRsrc : LLVM_OpaquePointerInAddressSpace<8>; +def ROCDLBufferRsrc : LLVM_PointerInAddressSpace<8>; def ROCDL_MakeBufferRsrcOp : ROCDL_IntrOp<"make.buffer.rsrc", [], [0], [Pure], 1>, diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp index 35b95d7a5ebe92..04496d6b8f6344 100644 --- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp +++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp @@ -76,16 +76,6 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx, : std::nullopt; }); - // LLVM container types may (recursively) contain other types that must be - // converted even when the outer type is compatible. - addConversion([&](LLVM::LLVMPointerType type) -> std::optional { - if (type.isOpaque()) - return type; - if (auto pointee = convertType(type.getElementType())) - return LLVM::LLVMPointerType::get(pointee, type.getAddressSpace()); - return std::nullopt; - }); - addConversion([&](LLVM::LLVMStructType type, SmallVectorImpl &results) -> std::optional { // Fastpath for types that won't be converted by this callback anyway. 
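With the element type gone, the type converter above no longer needs to recurse into pointee types, and building a pointer type on the C++ side takes only a context and an optional address space. A minimal sketch, assuming `ctx` is a valid `MLIRContext *`:

    // Builds !llvm.ptr (default address space 0).
    LLVM::LLVMPointerType ptrTy = LLVM::LLVMPointerType::get(ctx);
    // Builds !llvm.ptr<3>: the address space is the only parameter now;
    // there is no pointee type left to supply or verify.
    LLVM::LLVMPointerType sharedTy = LLVM::LLVMPointerType::get(ctx, 3);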
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 7f5681e7bdc059..c22cff4c1328a5 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -216,17 +216,13 @@ OpFoldResult ICmpOp::fold(FoldAdaptor adaptor) { //===----------------------------------------------------------------------===// void AllocaOp::print(OpAsmPrinter &p) { - Type elemTy = llvm::cast(getType()).getElementType(); - if (!elemTy) - elemTy = *getElemType(); - auto funcTy = FunctionType::get(getContext(), {getArraySize().getType()}, {getType()}); if (getInalloca()) p << " inalloca"; - p << ' ' << getArraySize() << " x " << elemTy; + p << ' ' << getArraySize() << " x " << getElemType(); if (getAlignment() && *getAlignment() != 0) p.printOptionalAttrDict((*this)->getAttrs(), {kElemTypeAttrName, getInallocaAttrName()}); @@ -277,40 +273,16 @@ ParseResult AllocaOp::parse(OpAsmParser &parser, OperationState &result) { return failure(); Type resultType = funcType.getResult(0); - if (auto ptrResultType = llvm::dyn_cast(resultType)) { - if (ptrResultType.isOpaque()) - result.addAttribute(kElemTypeAttrName, TypeAttr::get(elemType)); - } + if (auto ptrResultType = llvm::dyn_cast(resultType)) + result.addAttribute(kElemTypeAttrName, TypeAttr::get(elemType)); result.addTypes({funcType.getResult(0)}); return success(); } -/// Checks that the elemental type is present in either the pointer type or -/// the attribute, but not both. -static LogicalResult verifyOpaquePtr(Operation *op, LLVMPointerType ptrType, - std::optional ptrElementType) { - if (ptrType.isOpaque() && !ptrElementType.has_value()) { - return op->emitOpError() << "expected '" << kElemTypeAttrName - << "' attribute if opaque pointer type is used"; - } - if (!ptrType.isOpaque() && ptrElementType.has_value()) { - return op->emitOpError() - << "unexpected '" << kElemTypeAttrName - << "' attribute when non-opaque pointer type is used"; - } - return success(); -} - LogicalResult AllocaOp::verify() { - LLVMPointerType ptrType = llvm::cast(getType()); - if (failed(verifyOpaquePtr(getOperation(), ptrType, getElemType()))) - return failure(); - - Type elemTy = - (ptrType.isOpaque()) ? *getElemType() : ptrType.getElementType(); // Only certain target extension types can be used in 'alloca'. - if (auto targetExtType = dyn_cast(elemTy); + if (auto targetExtType = dyn_cast(getElemType()); targetExtType && !targetExtType.supportsMemOps()) return emitOpError() << "this target extension type cannot be used in alloca"; @@ -318,11 +290,7 @@ LogicalResult AllocaOp::verify() { return success(); } -Type AllocaOp::getResultPtrElementType() { - // This will become trivial once non-opaque pointers are gone. - return getElemType().has_value() ? 
*getElemType() - : getResult().getType().getElementType(); -} +Type AllocaOp::getResultPtrElementType() { return getElemType(); } //===----------------------------------------------------------------------===// // LLVM::BrOp @@ -525,18 +493,6 @@ static Type extractVectorElementType(Type type) { return type; } -void GEPOp::build(OpBuilder &builder, OperationState &result, Type resultType, - Value basePtr, ArrayRef indices, bool inbounds, - ArrayRef attributes) { - auto ptrType = - llvm::cast(extractVectorElementType(basePtr.getType())); - assert(!ptrType.isOpaque() && - "expected non-opaque pointer, provide elementType explicitly when " - "opaque pointers are used"); - build(builder, result, resultType, ptrType.getElementType(), basePtr, indices, - inbounds, attributes); -} - /// Destructures the 'indices' parameter into 'rawConstantIndices' and /// 'dynamicIndices', encoding the former in the process. In the process, /// dynamic indices which are used to index into a structure type are converted @@ -602,20 +558,11 @@ void GEPOp::build(OpBuilder &builder, OperationState &result, Type resultType, result.addAttribute(getInboundsAttrName(result.name), builder.getUnitAttr()); } - if (llvm::cast(extractVectorElementType(basePtr.getType())) - .isOpaque()) - result.addAttribute(kElemTypeAttrName, TypeAttr::get(elementType)); + result.addAttribute(kElemTypeAttrName, TypeAttr::get(elementType)); result.addOperands(basePtr); result.addOperands(dynamicIndices); } -void GEPOp::build(OpBuilder &builder, OperationState &result, Type resultType, - Value basePtr, ValueRange indices, bool inbounds, - ArrayRef attributes) { - build(builder, result, resultType, basePtr, SmallVector(indices), - inbounds, attributes); -} - void GEPOp::build(OpBuilder &builder, OperationState &result, Type resultType, Type elementType, Value basePtr, ValueRange indices, bool inbounds, ArrayRef attributes) { @@ -712,35 +659,20 @@ verifyStructIndices(Type baseGEPType, GEPIndicesAdaptor indices, } LogicalResult LLVM::GEPOp::verify() { - if (failed(verifyOpaquePtr( - getOperation(), - llvm::cast(extractVectorElementType(getType())), - getElemType()))) - return failure(); - if (static_cast( llvm::count(getRawConstantIndices(), kDynamicIndex)) != getDynamicIndices().size()) return emitOpError("expected as many dynamic indices as specified in '") << getRawConstantIndicesAttrName().getValue() << "'"; - return verifyStructIndices(getSourceElementType(), getIndices(), + return verifyStructIndices(getElemType(), getIndices(), [&] { return emitOpError(); }); } -Type LLVM::GEPOp::getSourceElementType() { - if (std::optional elemType = getElemType()) - return *elemType; - - return llvm::cast( - extractVectorElementType(getBase().getType())) - .getElementType(); -} - Type GEPOp::getResultPtrElementType() { // Set the initial type currently being used for indexing. This will be // updated as the indices get walked over. - Type selectedType = getSourceElementType(); + Type selectedType = getElemType(); // Follow the indexed elements in the gep. 
auto indices = getIndices(); @@ -836,14 +768,6 @@ LogicalResult LoadOp::verify() { {AtomicOrdering::release, AtomicOrdering::acq_rel}); } -void LoadOp::build(OpBuilder &builder, OperationState &state, Value addr, - unsigned alignment, bool isVolatile, bool isNonTemporal) { - auto type = llvm::cast(addr.getType()).getElementType(); - assert(type && "must provide explicit element type to the constructor " - "when the pointer type is opaque"); - build(builder, state, type, addr, alignment, isVolatile, isNonTemporal); -} - void LoadOp::build(OpBuilder &builder, OperationState &state, Type type, Value addr, unsigned alignment, bool isVolatile, bool isNonTemporal, AtomicOrdering ordering, @@ -857,51 +781,6 @@ void LoadOp::build(OpBuilder &builder, OperationState &state, Type type, /*tbaa=*/nullptr); } -// Extract the pointee type from the LLVM pointer type wrapped in MLIR. Return -// the resulting type if any, null type if opaque pointers are used, and -// std::nullopt if the given type is not the pointer type. -static std::optional -getLoadStoreElementType(OpAsmParser &parser, Type type, SMLoc trailingTypeLoc) { - auto llvmTy = llvm::dyn_cast(type); - if (!llvmTy) { - parser.emitError(trailingTypeLoc, "expected LLVM pointer type"); - return std::nullopt; - } - return llvmTy.getElementType(); -} - -/// Parses the LoadOp type either using the typed or opaque pointer format. -// TODO: Drop once the typed pointer assembly format is not needed anymore. -static ParseResult parseLoadType(OpAsmParser &parser, Type &type, - Type &elementType) { - SMLoc trailingTypeLoc; - if (parser.getCurrentLocation(&trailingTypeLoc) || parser.parseType(type)) - return failure(); - - std::optional pointerElementType = - getLoadStoreElementType(parser, type, trailingTypeLoc); - if (!pointerElementType) - return failure(); - if (*pointerElementType) { - elementType = *pointerElementType; - return success(); - } - - if (parser.parseArrow() || parser.parseType(elementType)) - return failure(); - return success(); -} - -/// Prints the LoadOp type either using the typed or opaque pointer format. -// TODO: Drop once the typed pointer assembly format is not needed anymore. -static void printLoadType(OpAsmPrinter &printer, Operation *op, Type type, - Type elementType) { - printer << type; - auto pointerType = cast(type); - if (pointerType.isOpaque()) - printer << " -> " << elementType; -} - //===----------------------------------------------------------------------===// // StoreOp //===----------------------------------------------------------------------===// @@ -940,38 +819,6 @@ void StoreOp::build(OpBuilder &builder, OperationState &state, Value value, /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } -/// Parses the StoreOp type either using the typed or opaque pointer format. -// TODO: Drop once the typed pointer assembly format is not needed anymore. -static ParseResult parseStoreType(OpAsmParser &parser, Type &elementType, - Type &type) { - SMLoc trailingTypeLoc; - if (parser.getCurrentLocation(&trailingTypeLoc) || - parser.parseType(elementType)) - return failure(); - - if (succeeded(parser.parseOptionalComma())) - return parser.parseType(type); - - // Extract the element type from the pointer type. 
- type = elementType; - std::optional pointerElementType = - getLoadStoreElementType(parser, type, trailingTypeLoc); - if (!pointerElementType) - return failure(); - elementType = *pointerElementType; - return success(); -} - -/// Prints the StoreOp type either using the typed or opaque pointer format. -// TODO: Drop once the typed pointer assembly format is not needed anymore. -static void printStoreType(OpAsmPrinter &printer, Operation *op, - Type elementType, Type type) { - auto pointerType = cast(type); - if (pointerType.isOpaque()) - printer << elementType << ", "; - printer << type; -} - //===----------------------------------------------------------------------===// // CallOp //===----------------------------------------------------------------------===// @@ -1055,22 +902,6 @@ void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } - -void CallOp::build(OpBuilder &builder, OperationState &state, Value callee, - ValueRange args) { - auto calleeType = cast( - cast(callee.getType()).getElementType()); - SmallVector operands; - operands.reserve(1 + args.size()); - operands.push_back(callee); - llvm::append_range(operands, args); - return build(builder, state, getCallOpResultTypes(calleeType), - TypeAttr::get(calleeType), FlatSymbolRefAttr(), operands, - /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr, - /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, - /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); -} - CallInterfaceCallable CallOp::getCallableForCallee() { // Direct call. if (FlatSymbolRefAttr calleeAttr = getCalleeAttr()) @@ -1145,10 +976,7 @@ LogicalResult CallOp::verifySymbolUses(SymbolTableCollection &symbolTable) { return emitOpError("indirect call expects a pointer as callee: ") << getOperand(0).getType(); - if (ptrType.isOpaque()) - return success(); - - fnType = ptrType.getElementType(); + return success(); } else { Operation *callee = symbolTable.lookupNearestSymbolFrom(*this, calleeName.getAttr()); @@ -1848,17 +1676,6 @@ AddressOfOp::verifySymbolUses(SymbolTableCollection &symbolTable) { return emitOpError("pointer address space must match address space of the " "referenced global"); - if (type.isOpaque()) - return success(); - - if (global && type.getElementType() != global.getType()) - return emitOpError( - "the type must be a pointer to the type of the referenced global"); - - if (function && type.getElementType() != function.getFunctionType()) - return emitOpError( - "the type must be a pointer to the type of the referenced function"); - return success(); } @@ -2135,9 +1952,13 @@ static bool isZeroAttribute(Attribute value) { } LogicalResult GlobalOp::verify() { - if (!LLVMPointerType::isValidElementType(getType())) + bool validType = isCompatibleOuterType(getType()) + ? 
!llvm::isa(getType()) + : llvm::isa(getType()); + if (!validType) return emitOpError( - "expects type to be a valid element type for an LLVM pointer"); + "expects type to be a valid element type for an LLVM global"); if ((*this)->getParentOp() && !satisfiesLLVMModule((*this)->getParentOp())) return emitOpError("must appear at the module level"); @@ -2733,11 +2554,7 @@ void AtomicRMWOp::build(OpBuilder &builder, OperationState &state, } LogicalResult AtomicRMWOp::verify() { - auto ptrType = llvm::cast(getPtr().getType()); auto valType = getVal().getType(); - if (!ptrType.isOpaque() && valType != ptrType.getElementType()) - return emitOpError("expected LLVM IR element type for operand #0 to " - "match type for operand #1"); if (getBinOp() == AtomicBinOp::fadd || getBinOp() == AtomicBinOp::fsub || getBinOp() == AtomicBinOp::fmin || getBinOp() == AtomicBinOp::fmax) { if (!mlir::LLVM::isCompatibleFloatingPointType(valType)) @@ -2790,9 +2607,6 @@ LogicalResult AtomicCmpXchgOp::verify() { if (!ptrType) return emitOpError("expected LLVM IR pointer type for operand #0"); auto valType = getVal().getType(); - if (!ptrType.isOpaque() && valType != ptrType.getElementType()) - return emitOpError("expected LLVM IR element type for operand #0 to " - "match type for all other operands"); if (!isTypeCompatibleWithAtomicOp(valType, /*isPointerTypeAllowed=*/true)) return emitOpError("unexpected LLVM IR type"); @@ -2987,7 +2801,7 @@ OpFoldResult LLVM::GEPOp::fold(FoldAdaptor adaptor) { if (changed) { SmallVector rawConstantIndices; SmallVector dynamicIndices; - destructureIndices(getSourceElementType(), gepArgs, rawConstantIndices, + destructureIndices(getElemType(), gepArgs, rawConstantIndices, dynamicIndices); getDynamicIndicesMutable().assign(dynamicIndices); @@ -3185,14 +2999,7 @@ LogicalResult LLVMDialect::verifyParameterAttribute(Operation *op, auto checkPointerTypeMatches = [&]() -> LogicalResult { if (failed(checkPointerType())) return failure(); - auto ptrType = llvm::cast(paramType); - auto typeAttr = llvm::cast(paramAttr.getValue()); - if (!ptrType.isOpaque() && ptrType.getElementType() != typeAttr.getValue()) - return op->emitError() - << name - << " attribute attached to LLVM pointer argument of " - "different type"; return success(); }; @@ -3317,8 +3124,7 @@ Operation *LLVMDialect::materializeConstant(OpBuilder &builder, Attribute value, Value mlir::LLVM::createGlobalString(Location loc, OpBuilder &builder, StringRef name, StringRef value, - LLVM::Linkage linkage, - bool useOpaquePointers) { + LLVM::Linkage linkage) { assert(builder.getInsertionBlock() && builder.getInsertionBlock()->getParentOp() && "expected builder to point to a block constrained in an op"); @@ -3334,19 +3140,11 @@ Value mlir::LLVM::createGlobalString(Location loc, OpBuilder &builder, loc, type, /*isConstant=*/true, linkage, name, builder.getStringAttr(value), /*alignment=*/0); - LLVMPointerType resultType; - LLVMPointerType charPtr; - if (!useOpaquePointers) { - resultType = LLVMPointerType::get(type); - charPtr = LLVMPointerType::get(IntegerType::get(ctx, 8)); - } else { - resultType = charPtr = LLVMPointerType::get(ctx); - } - + LLVMPointerType ptrType = LLVMPointerType::get(ctx); // Get the pointer to the first character in the global string. 
- Value globalPtr = builder.create(loc, resultType, - global.getSymNameAttr()); - return builder.create(loc, charPtr, type, globalPtr, + Value globalPtr = + builder.create(loc, ptrType, global.getSymNameAttr()); + return builder.create(loc, ptrType, type, globalPtr, ArrayRef{0, 0}); } diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp index 2cdc5d9fff84e7..cc2a917e58f639 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp @@ -32,7 +32,7 @@ llvm::SmallVector LLVM::AllocaOp::getPromotableSlots() { if (!getOperation()->getBlock()->isEntryBlock()) return {}; - return {MemorySlot{getResult(), getResultPtrElementType()}}; + return {MemorySlot{getResult(), getElemType()}}; } Value LLVM::AllocaOp::getDefaultValue(const MemorySlot &slot, @@ -61,8 +61,7 @@ SmallVector LLVM::AllocaOp::getDestructurableSlots() { if (!mlir::matchPattern(getArraySize(), m_One())) return {}; - Type elemType = getResultPtrElementType(); - auto destructurable = dyn_cast(elemType); + auto destructurable = dyn_cast(getElemType()); if (!destructurable) return {}; @@ -75,7 +74,8 @@ SmallVector LLVM::AllocaOp::getDestructurableSlots() { for (Attribute index : llvm::make_first_range(destructuredType.value())) allocaTypeMap.insert({index, LLVM::LLVMPointerType::get(getContext())}); - return {DestructurableMemorySlot{{getResult(), elemType}, {allocaTypeMap}}}; + return { + DestructurableMemorySlot{{getResult(), getElemType()}, {allocaTypeMap}}}; } DenseMap @@ -83,12 +83,9 @@ LLVM::AllocaOp::destructure(const DestructurableMemorySlot &slot, const SmallPtrSetImpl &usedIndices, RewriterBase &rewriter) { assert(slot.ptr == getResult()); - Type elemType = - getElemType() ? *getElemType() : getResult().getType().getElementType(); - rewriter.setInsertionPointAfter(*this); - auto destructurableType = cast(elemType); + auto destructurableType = cast(getElemType()); DenseMap slotMap; for (Attribute index : usedIndices) { Type elemType = destructurableType.getTypeAtIndex(index); @@ -337,11 +334,6 @@ bool LLVM::GEPOp::canRewire(const DestructurableMemorySlot &slot, if (!basePtrType) return false; - // Typed pointers are not supported. This should be removed once typed - // pointers are removed from the LLVM dialect. 
- if (!basePtrType.isOpaque()) - return false; - if (getBase() != slot.ptr || slot.elemType != getElemType()) return false; if (!isFirstIndexZero(*this)) diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp index bc8300a8b7329e..8841aa8362569a 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp @@ -75,40 +75,6 @@ static void printFunctionTypes(AsmPrinter &p, ArrayRef params, p << ')'; } -//===----------------------------------------------------------------------===// -// custom -//===----------------------------------------------------------------------===// - -static ParseResult parsePointer(AsmParser &p, Type &elementType, - unsigned &addressSpace) { - // `<` addressSpace `>` - OptionalParseResult result = p.parseOptionalInteger(addressSpace); - if (result.has_value()) { - if (failed(result.value())) - return failure(); - elementType = Type(); - return success(); - } - - if (parsePrettyLLVMType(p, elementType)) - return failure(); - if (succeeded(p.parseOptionalComma())) - return p.parseInteger(addressSpace); - - return success(); -} - -static void printPointer(AsmPrinter &p, Type elementType, - unsigned addressSpace) { - if (elementType) - printPrettyLLVMType(p, elementType); - if (addressSpace != 0) { - if (elementType) - p << ", "; - p << addressSpace; - } -} - //===----------------------------------------------------------------------===// // custom //===----------------------------------------------------------------------===// @@ -285,33 +251,6 @@ LLVMFunctionType::verify(function_ref emitError, return success(); } -//===----------------------------------------------------------------------===// -// LLVMPointerType -//===----------------------------------------------------------------------===// - -bool LLVMPointerType::isValidElementType(Type type) { - if (!type) - return true; - return isCompatibleOuterType(type) - ? !llvm::isa(type) - : llvm::isa(type); -} - -LLVMPointerType LLVMPointerType::get(Type pointee, unsigned addressSpace) { - assert(pointee && "expected non-null subtype, pass the context instead if " - "the opaque pointer type is desired"); - return Base::get(pointee.getContext(), pointee, addressSpace); -} - -LogicalResult -LLVMPointerType::verify(function_ref emitError, - Type pointee, unsigned) { - if (!isValidElementType(pointee)) - return emitError() << "invalid pointer element type: " << pointee; - return success(); -} - //===----------------------------------------------------------------------===// // DataLayoutTypeInterface @@ -369,9 +308,7 @@ LLVMPointerType::getTypeSizeInBits(const DataLayout &dataLayout, // For other memory spaces, use the size of the pointer to the default memory // space. 
- if (isOpaque()) - return dataLayout.getTypeSizeInBits(get(getContext())); - return dataLayout.getTypeSizeInBits(get(getElementType())); + return dataLayout.getTypeSizeInBits(get(getContext())); } unsigned LLVMPointerType::getABIAlignment(const DataLayout &dataLayout, @@ -380,9 +317,7 @@ unsigned LLVMPointerType::getABIAlignment(const DataLayout &dataLayout, getPointerDataLayoutEntry(params, *this, PtrDLEntryPos::Abi)) return *alignment; - if (isOpaque()) - return dataLayout.getTypeABIAlignment(get(getContext())); - return dataLayout.getTypeABIAlignment(get(getElementType())); + return dataLayout.getTypeABIAlignment(get(getContext())); } unsigned @@ -392,9 +327,7 @@ LLVMPointerType::getPreferredAlignment(const DataLayout &dataLayout, getPointerDataLayoutEntry(params, *this, PtrDLEntryPos::Preferred)) return *alignment; - if (isOpaque()) - return dataLayout.getTypePreferredAlignment(get(getContext())); - return dataLayout.getTypePreferredAlignment(get(getElementType())); + return dataLayout.getTypePreferredAlignment(get(getContext())); } bool LLVMPointerType::areCompatible(DataLayoutEntryListRef oldLayout, @@ -440,7 +373,6 @@ LogicalResult LLVMPointerType::verifyEntries(DataLayoutEntryListRef entries, for (DataLayoutEntryInterface entry : entries) { if (!entry.isTypeEntry()) continue; - auto key = llvm::cast(entry.getKey().get()); auto values = llvm::dyn_cast(entry.getValue()); if (!values || (values.size() != 3 && values.size() != 4)) { return emitError(loc) @@ -448,10 +380,6 @@ LogicalResult LLVMPointerType::verifyEntries(DataLayoutEntryListRef entries, << " to be a dense integer elements attribute with 3 or 4 " "elements"; } - if (key.getElementType() && !key.getElementType().isInteger(8)) { - return emitError(loc) << "unexpected layout attribute for pointer to " - << key.getElementType(); - } if (extractPointerSpecValue(values, PtrDLEntryPos::Abi) > extractPointerSpecValue(values, PtrDLEntryPos::Preferred)) { return emitError(loc) << "preferred alignment is expected to be at least " @@ -869,11 +797,7 @@ static bool isCompatibleImpl(Type type, DenseSet &compatibleTypes) { return vecType.getRank() == 1 && isCompatible(vecType.getElementType()); }) - .Case([&](auto pointerType) { - if (pointerType.isOpaque()) - return true; - return isCompatible(pointerType.getElementType()); - }) + .Case([&](auto pointerType) { return true; }) .Case([&](auto extType) { return llvm::all_of(extType.getTypeParams(), isCompatible); }) diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp index 9731689e551762..b094c650ff1932 100644 --- a/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp +++ b/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp @@ -92,10 +92,6 @@ LogicalResult AddFieldGetterToStructDirectUse::matchAndRewrite( LoadOp load, PatternRewriter &rewriter) const { PatternRewriter::InsertionGuard guard(rewriter); - // Load from typed pointers are not supported. - if (!load.getAddr().getType().isOpaque()) - return failure(); - Type inconsistentElementType = isElementTypeInconsistent(load.getAddr(), load.getType()); if (!inconsistentElementType) @@ -129,10 +125,6 @@ LogicalResult AddFieldGetterToStructDirectUse::matchAndRewrite( StoreOp store, PatternRewriter &rewriter) const { PatternRewriter::InsertionGuard guard(rewriter); - // Store to typed pointers are not supported. 
- if (!store.getAddr().getType().isOpaque()) - return failure(); - Type inconsistentElementType = isElementTypeInconsistent(store.getAddr(), store.getValue().getType()); if (!inconsistentElementType) @@ -172,9 +164,9 @@ static std::optional gepToByteOffset(DataLayout &layout, GEPOp gep) { indices.push_back(indexInt.getInt()); } - uint64_t offset = indices[0] * layout.getTypeSize(gep.getSourceElementType()); + uint64_t offset = indices[0] * layout.getTypeSize(gep.getElemType()); - Type currentType = gep.getSourceElementType(); + Type currentType = gep.getElemType(); for (uint32_t index : llvm::drop_begin(indices)) { bool shouldCancel = TypeSwitch(currentType) @@ -579,7 +571,7 @@ LogicalResult SplitStores::matchAndRewrite(StoreOp store, return failure(); offset = *byteOffset; - typeHint = gepOp.getSourceElementType(); + typeHint = gepOp.getElemType(); address = gepOp.getBase(); } } @@ -661,8 +653,7 @@ LogicalResult SplitGEP::matchAndRewrite(GEPOp gepOp, // Split of the first GEP using the first two indices. auto subGepOp = rewriter.create( - gepOp.getLoc(), gepOp.getType(), gepOp.getSourceElementType(), - gepOp.getBase(), + gepOp.getLoc(), gepOp.getType(), gepOp.getElemType(), gepOp.getBase(), llvm::map_to_vector(llvm::make_range(indices.begin(), splitIter), indexToGEPArg), gepOp.getInbounds()); diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index df64d561f46cb3..ff2eb9ebfc42f5 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -27,14 +27,19 @@ using namespace acc; #include "mlir/Dialect/OpenACC/OpenACCTypeInterfaces.cpp.inc" namespace { -/// Model for pointer-like types that already provide a `getElementType` method. -template -struct PointerLikeModel - : public PointerLikeType::ExternalModel, T> { +struct MemRefPointerLikeModel + : public PointerLikeType::ExternalModel { Type getElementType(Type pointer) const { - return llvm::cast(pointer).getElementType(); + return llvm::cast(pointer).getElementType(); } }; + +struct LLVMPointerPointerLikeModel + : public PointerLikeType::ExternalModel { + Type getElementType(Type pointer) const { return Type(); } +}; } // namespace //===----------------------------------------------------------------------===// @@ -58,9 +63,9 @@ void OpenACCDialect::initialize() { // By attaching interfaces here, we make the OpenACC dialect dependent on // the other dialects. This is probably better than having dialects like LLVM // and memref be dependent on OpenACC. 
- LLVM::LLVMPointerType::attachInterface< - PointerLikeModel>(*getContext()); - MemRefType::attachInterface>(*getContext()); + MemRefType::attachInterface(*getContext()); + LLVM::LLVMPointerType::attachInterface( + *getContext()); } //===----------------------------------------------------------------------===// @@ -1023,17 +1028,13 @@ void EnterDataOp::getCanonicalizationPatterns(RewritePatternSet &results, // AtomicReadOp //===----------------------------------------------------------------------===// -LogicalResult AtomicReadOp::verify() { - return verifyCommon(); -} +LogicalResult AtomicReadOp::verify() { return verifyCommon(); } //===----------------------------------------------------------------------===// // AtomicWriteOp //===----------------------------------------------------------------------===// -LogicalResult AtomicWriteOp::verify() { - return verifyCommon(); -} +LogicalResult AtomicWriteOp::verify() { return verifyCommon(); } //===----------------------------------------------------------------------===// // AtomicUpdateOp @@ -1054,13 +1055,9 @@ LogicalResult AtomicUpdateOp::canonicalize(AtomicUpdateOp op, return failure(); } -LogicalResult AtomicUpdateOp::verify() { - return verifyCommon(); -} +LogicalResult AtomicUpdateOp::verify() { return verifyCommon(); } -LogicalResult AtomicUpdateOp::verifyRegions() { - return verifyRegionsCommon(); -} +LogicalResult AtomicUpdateOp::verifyRegions() { return verifyRegionsCommon(); } //===----------------------------------------------------------------------===// // AtomicCaptureOp @@ -1084,9 +1081,7 @@ AtomicUpdateOp AtomicCaptureOp::getAtomicUpdateOp() { return dyn_cast(getSecondOp()); } -LogicalResult AtomicCaptureOp::verifyRegions() { - return verifyRegionsCommon(); -} +LogicalResult AtomicCaptureOp::verifyRegions() { return verifyRegionsCommon(); } //===----------------------------------------------------------------------===// // DeclareEnterOp diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index a56e75eae06bb0..480af0e1307c15 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -39,15 +39,20 @@ using namespace mlir; using namespace mlir::omp; namespace { -/// Model for pointer-like types that already provide a `getElementType` method. 
-template -struct PointerLikeModel - : public PointerLikeType::ExternalModel, T> { +struct MemRefPointerLikeModel + : public PointerLikeType::ExternalModel { Type getElementType(Type pointer) const { - return llvm::cast(pointer).getElementType(); + return llvm::cast(pointer).getElementType(); } }; +struct LLVMPointerPointerLikeModel + : public PointerLikeType::ExternalModel { + Type getElementType(Type pointer) const { return Type(); } +}; + struct OpenMPDialectFoldInterface : public DialectFoldInterface { using DialectFoldInterface::DialectFoldInterface; @@ -73,11 +78,9 @@ void OpenMPDialect::initialize() { >(); addInterface(); - LLVM::LLVMPointerType::attachInterface< - PointerLikeModel>(*getContext()); - MemRefType::attachInterface>(*getContext()); - LLVM::LLVMPointerType::attachInterface< - PointerLikeModel>(*getContext()); + MemRefType::attachInterface(*getContext()); + LLVM::LLVMPointerType::attachInterface( + *getContext()); // Attach default offload module interface to module op to access // offload functionality through @@ -1415,9 +1418,7 @@ LogicalResult AtomicUpdateOp::verify() { return verifySynchronizationHint(*this, getHintVal()); } -LogicalResult AtomicUpdateOp::verifyRegions() { - return verifyRegionsCommon(); -} +LogicalResult AtomicUpdateOp::verifyRegions() { return verifyRegionsCommon(); } //===----------------------------------------------------------------------===// // Verifier for AtomicCaptureOp diff --git a/mlir/test/Dialect/LLVMIR/global.mlir b/mlir/test/Dialect/LLVMIR/global.mlir index a33fff3967e4d5..81178b2ef901f4 100644 --- a/mlir/test/Dialect/LLVMIR/global.mlir +++ b/mlir/test/Dialect/LLVMIR/global.mlir @@ -117,11 +117,10 @@ llvm.mlir.global internal protected unnamed_addr @protected(42 : i32) : i32 // ----- -// expected-error @+1 {{expects type to be a valid element type for an LLVM pointer}} +// expected-error @+1 {{expects type to be a valid element type for an LLVM global}} llvm.mlir.global internal constant @constant(37.0) : !llvm.label // ----- - // expected-error @+1 {{'addr_space' failed to satisfy constraint: 32-bit signless integer attribute whose value is non-negative}} "llvm.mlir.global"() ({}) {sym_name = "foo", global_type = i64, value = 42 : i64, addr_space = -1 : i32, linkage = #llvm.linkage} : () -> () diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir index fe2f94454561a0..d6960f00f0428c 100644 --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -90,30 +90,23 @@ func.func @alloca_non_integer_alignment() { // ----- -func.func @alloca_opaque_ptr_no_type(%sz : i64) { - // expected-error@below {{expected 'elem_type' attribute if opaque pointer type is used}} - "llvm.alloca"(%sz) : (i64) -> !llvm.ptr -} - -// ----- - func.func @gep_missing_input_result_type(%pos : i64, %base : !llvm.ptr) { // expected-error@+1 {{2 operands present, but expected 0}} - llvm.getelementptr %base[%pos] : () -> () + llvm.getelementptr %base[%pos] : () -> (), i64 } // ----- func.func @gep_missing_input_type(%pos : i64, %base : !llvm.ptr) { // expected-error@+1 {{2 operands present, but expected 0}} - llvm.getelementptr %base[%pos] : () -> (!llvm.ptr) + llvm.getelementptr %base[%pos] : () -> (!llvm.ptr), i64 } // ----- func.func @gep_missing_result_type(%pos : i64, %base : !llvm.ptr) { // expected-error@+1 {{op requires one result}} - llvm.getelementptr %base[%pos] : (!llvm.ptr, i64) -> () + llvm.getelementptr %base[%pos] : (!llvm.ptr, i64) -> (), i64 } // ----- @@ -133,15 +126,8 @@ func.func 
@gep_too_few_dynamic(%base : !llvm.ptr) { // ----- func.func @load_non_llvm_type(%foo : memref) { - // expected-error@+1 {{expected LLVM pointer type}} - llvm.load %foo : memref -} - -// ----- - -func.func @load_non_ptr_type(%foo : f32) { - // expected-error@+1 {{expected LLVM pointer type}} - llvm.load %foo : f32 + // expected-error@+1 {{op operand #0 must be LLVM pointer type}} + llvm.load %foo : memref -> f32 } // ----- @@ -181,27 +167,6 @@ func.func @load_unaligned_atomic(%ptr : !llvm.ptr) { // ----- -func.func @store_non_llvm_type(%foo : memref, %bar : f32) { - // expected-error@+1 {{expected LLVM pointer type}} - llvm.store %bar, %foo : memref -} - -// ----- - -func.func @store_non_ptr_type(%foo : f32, %bar : f32) { - // expected-error@+1 {{expected LLVM pointer type}} - llvm.store %bar, %foo : f32 -} - -// ----- - -func.func @store_malformed_elem_type(%foo: !llvm.ptr, %bar: f32) { - // expected-error@+1 {{expected non-function type}} - llvm.store %bar, %foo : !llvm.ptr, "f32" -} - -// ----- - func.func @store_syncscope(%val : f32, %ptr : !llvm.ptr) { // expected-error@below {{expected syncscope to be null for non-atomic access}} "llvm.store"(%val, %ptr) {syncscope = "singlethread"} : (f32, !llvm.ptr) -> () @@ -632,14 +597,6 @@ func.func @nvvm_invalid_mma_8(%a0 : i32, %a1 : i32, // ----- -func.func @atomicrmw_expected_ptr(%f32 : f32) { - // expected-error@+1 {{operand #0 must be LLVM pointer to floating point LLVM type or LLVM pointer type or integer}} - %0 = "llvm.atomicrmw"(%f32, %f32) {bin_op=11, ordering=1} : (f32, f32) -> f32 - llvm.return -} - -// ----- - func.func @atomicrmw_mismatched_operands(%f32_ptr : !llvm.ptr, %f32 : f32) { // expected-error@+1 {{op failed to verify that result #0 and operand #1 have the same type}} %0 = "llvm.atomicrmw"(%f32_ptr, %f32) {bin_op=11, ordering=1} : (!llvm.ptr, f32) -> i32 @@ -672,14 +629,6 @@ func.func @atomicrmw_expected_int(%f32_ptr : !llvm.ptr, %f32 : f32) { // ----- -func.func @cmpxchg_expected_ptr(%f32 : f32) { - // expected-error@+1 {{op operand #0 must be LLVM pointer to integer or LLVM pointer type}} - %0 = "llvm.cmpxchg"(%f32, %f32, %f32) {success_ordering=2,failure_ordering=2} : (f32, f32, f32) -> !llvm.struct<(f32, i1)> - llvm.return -} - -// ----- - func.func @cmpxchg_mismatched_value_operands(%ptr : !llvm.ptr, %i32 : i32, %i64 : i64) { // expected-error@+1 {{op failed to verify that operand #1 and operand #2 have the same type}} %0 = "llvm.cmpxchg"(%ptr, %i32, %i64) {success_ordering=2,failure_ordering=2} : (!llvm.ptr, i32, i64) -> !llvm.struct<(i32, i1)> diff --git a/mlir/unittests/Dialect/LLVMIR/LLVMTypeTest.cpp b/mlir/unittests/Dialect/LLVMIR/LLVMTypeTest.cpp index 2d9f8d0e607499..083dec819a0e05 100644 --- a/mlir/unittests/Dialect/LLVMIR/LLVMTypeTest.cpp +++ b/mlir/unittests/Dialect/LLVMIR/LLVMTypeTest.cpp @@ -17,37 +17,3 @@ TEST_F(LLVMIRTest, IsStructTypeMutable) { ASSERT_TRUE(bool(structTy)); ASSERT_TRUE(structTy.hasTrait()); } - -TEST_F(LLVMIRTest, MutualReferencedSubElementTypes) { - auto fooStructTy = LLVMStructType::getIdentified(&context, "foo"); - ASSERT_TRUE(bool(fooStructTy)); - auto barStructTy = LLVMStructType::getIdentified(&context, "bar"); - ASSERT_TRUE(bool(barStructTy)); - - // Created two structs that are referencing each other. 
- Type fooBody[] = {LLVMPointerType::get(barStructTy)}; - ASSERT_TRUE(succeeded(fooStructTy.setBody(fooBody, /*isPacked=*/false))); - Type barBody[] = {LLVMPointerType::get(fooStructTy)}; - ASSERT_TRUE(succeeded(barStructTy.setBody(barBody, /*isPacked=*/false))); - - // Test if walkSubElements goes into infinite loops. - SmallVector subElementTypes; - fooStructTy.walk([&](Type type) { subElementTypes.push_back(type); }); - ASSERT_EQ(subElementTypes.size(), 4U); - - // !llvm.ptr - ASSERT_TRUE(isa(subElementTypes[0])); - - // !llvm.struct<"bar",...> - auto structType = dyn_cast(subElementTypes[1]); - ASSERT_TRUE(bool(structType)); - ASSERT_TRUE(structType.getName().equals("bar")); - - // !llvm.ptr - ASSERT_TRUE(isa(subElementTypes[2])); - - // !llvm.struct<"foo",...> - structType = dyn_cast(subElementTypes[3]); - ASSERT_TRUE(bool(structType)); - ASSERT_TRUE(structType.getName().equals("foo")); -} From 1a1a5ec7566c23f080530e45e29d6cffa33b6020 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 6 Nov 2023 15:54:31 +0100 Subject: [PATCH 31/40] [SDAG] Avoid use of ConstantExpr::getFPTrunc() (NFC) Use the constant folding API instead. As we're working on ConstantFP, it is guaranteed to succeed. --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 82751a442dbc3b..ad5a4506efbd82 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" @@ -324,7 +325,8 @@ SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) { TLI.isLoadExtLegal(ISD::EXTLOAD, OrigVT, SVT) && TLI.ShouldShrinkFPConstant(OrigVT)) { Type *SType = SVT.getTypeForEVT(*DAG.getContext()); - LLVMC = cast(ConstantExpr::getFPTrunc(LLVMC, SType)); + LLVMC = cast(ConstantFoldCastOperand( + Instruction::FPTrunc, LLVMC, SType, DAG.getDataLayout())); VT = SVT; Extend = true; } From 12756831e4a96aed78e8724bb982e47444ff2ffc Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Mon, 6 Nov 2023 15:01:57 +0000 Subject: [PATCH 32/40] [Flang][OpenMP] Mark mergeable and untied clauses as unsupported (#71209) These two clauses are not supported in the OpenMP Dialect to LLVM conversion. Mark as TODO till support is added. 
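For readers unfamiliar with the lowering's clause screening, the gist is a std::get_if-based allow-list over the parse-tree clause variant; anything not explicitly allowed hits flang's TODO path. The following is a minimal, self-contained sketch of that pattern only -- every type name below is a stand-in invented for illustration, not flang's real parse-tree classes:

  #include <cstdio>
  #include <variant>

  // Stand-ins for parse-tree clause types (hypothetical; the real ones
  // live in flang's parser headers and carry payloads).
  struct IfClause {};
  struct FinalClause {};
  struct MergeableClause {};
  struct UntiedClause {};
  using Clause =
      std::variant<IfClause, FinalClause, MergeableClause, UntiedClause>;

  // Anything absent from the allow-list falls through to the TODO path,
  // which is where `mergeable` and `untied` now land.
  void screen(const Clause &clause) {
    if (!std::get_if<IfClause>(&clause) && !std::get_if<FinalClause>(&clause))
      std::puts("not yet implemented: OpenMP Block construct clause");
  }

  int main() {
    screen(Clause{FinalClause{}});     // supported, no diagnostic
    screen(Clause{MergeableClause{}}); // prints the TODO message
    screen(Clause{UntiedClause{}});    // prints the TODO message
  }

The new Todo tests below check for that same "not yet implemented" diagnostic text.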
--- flang/lib/Lower/OpenMP.cpp | 2 -- flang/test/Lower/OpenMP/FIR/task.f90 | 32 ++----------------- .../test/Lower/OpenMP/Todo/task_mergeable.f90 | 13 ++++++++ flang/test/Lower/OpenMP/Todo/task_untied.f90 | 13 ++++++++ flang/test/Lower/OpenMP/task.f90 | 32 ++----------------- 5 files changed, 30 insertions(+), 62 deletions(-) create mode 100644 flang/test/Lower/OpenMP/Todo/task_mergeable.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/task_untied.f90 diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp index 390bafb49e5124..087f535ff06408 100644 --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP.cpp @@ -3092,8 +3092,6 @@ genOMP(Fortran::lower::AbstractConverter &converter, !std::get_if(&clause.u) && !std::get_if(&clause.u) && !std::get_if(&clause.u) && - !std::get_if(&clause.u) && - !std::get_if(&clause.u) && !std::get_if(&clause.u) && !std::get_if(&clause.u) && !std::get_if(&clause.u) && diff --git a/flang/test/Lower/OpenMP/FIR/task.f90 b/flang/test/Lower/OpenMP/FIR/task.f90 index d7419bd1100e69..99a9e3a6b1e35a 100644 --- a/flang/test/Lower/OpenMP/FIR/task.f90 +++ b/flang/test/Lower/OpenMP/FIR/task.f90 @@ -40,34 +40,6 @@ subroutine omp_task_final(bar) !$omp end task end subroutine omp_task_final -!=============================================================================== -! `untied` clause -!=============================================================================== - -!CHECK-LABEL: func @_QPomp_task_untied() { -subroutine omp_task_untied() - !CHECK: omp.task untied { - !$omp task untied - !CHECK: fir.call @_QPfoo() {{.*}}: () -> () - call foo() - !CHECK: omp.terminator - !$omp end task -end subroutine omp_task_untied - -!=============================================================================== -! `mergeable` clause -!=============================================================================== - -!CHECK-LABEL: func @_QPomp_task_mergeable() { -subroutine omp_task_mergeable() - !CHECK: omp.task mergeable { - !$omp task mergeable - !CHECK: fir.call @_QPfoo() {{.*}}: () -> () - call foo() - !CHECK: omp.terminator - !$omp end task -end subroutine omp_task_mergeable - !=============================================================================== ! `priority` clause !=============================================================================== @@ -245,8 +217,8 @@ subroutine task_multiple_clauses() integer :: x, y, z logical :: buzz - !CHECK: omp.task if(%{{.+}}) final(%{{.+}}) untied mergeable priority(%{{.+}}) allocate(%{{.+}} : i32 -> %{{.+}} : !fir.ref) { - !$omp task if(buzz) final(buzz) untied mergeable priority(z) allocate(omp_high_bw_mem_alloc: x) private(x) firstprivate(y) + !CHECK: omp.task if(%{{.+}}) final(%{{.+}}) priority(%{{.+}}) allocate(%{{.+}} : i32 -> %{{.+}} : !fir.ref) { + !$omp task if(buzz) final(buzz) priority(z) allocate(omp_high_bw_mem_alloc: x) private(x) firstprivate(y) !CHECK: %[[x_priv:.+]] = fir.alloca i32 !CHECK: %[[y_priv:.+]] = fir.alloca i32 diff --git a/flang/test/Lower/OpenMP/Todo/task_mergeable.f90 b/flang/test/Lower/OpenMP/Todo/task_mergeable.f90 new file mode 100644 index 00000000000000..13145d92ccf902 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/task_mergeable.f90 @@ -0,0 +1,13 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s + +!=============================================================================== +! 
`mergeable` clause +!=============================================================================== + +! CHECK: not yet implemented: OpenMP Block construct clause +subroutine omp_task_mergeable() + !$omp task mergeable + call foo() + !$omp end task +end subroutine omp_task_mergeable diff --git a/flang/test/Lower/OpenMP/Todo/task_untied.f90 b/flang/test/Lower/OpenMP/Todo/task_untied.f90 new file mode 100644 index 00000000000000..19621c7aac16d6 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/task_untied.f90 @@ -0,0 +1,13 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s + +!=============================================================================== +! `untied` clause +!=============================================================================== + +! CHECK: not yet implemented: OpenMP Block construct clause +subroutine omp_task_untied() + !$omp task untied + call foo() + !$omp end task +end subroutine omp_task_untied diff --git a/flang/test/Lower/OpenMP/task.f90 b/flang/test/Lower/OpenMP/task.f90 index 99b1740ca75a81..7d7a79af3185f5 100644 --- a/flang/test/Lower/OpenMP/task.f90 +++ b/flang/test/Lower/OpenMP/task.f90 @@ -40,34 +40,6 @@ subroutine omp_task_final(bar) !$omp end task end subroutine omp_task_final -!=============================================================================== -! `untied` clause -!=============================================================================== - -!CHECK-LABEL: func @_QPomp_task_untied() { -subroutine omp_task_untied() - !CHECK: omp.task untied { - !$omp task untied - !CHECK: fir.call @_QPfoo() {{.*}}: () -> () - call foo() - !CHECK: omp.terminator - !$omp end task -end subroutine omp_task_untied - -!=============================================================================== -! `mergeable` clause -!=============================================================================== - -!CHECK-LABEL: func @_QPomp_task_mergeable() { -subroutine omp_task_mergeable() - !CHECK: omp.task mergeable { - !$omp task mergeable - !CHECK: fir.call @_QPfoo() {{.*}}: () -> () - call foo() - !CHECK: omp.terminator - !$omp end task -end subroutine omp_task_mergeable - !=============================================================================== ! `priority` clause !=============================================================================== @@ -253,8 +225,8 @@ subroutine task_multiple_clauses() integer :: x, y, z logical :: buzz - !CHECK: omp.task if(%{{.+}}) final(%{{.+}}) untied mergeable priority(%{{.+}}) allocate(%{{.+}} : i32 -> %{{.+}} : !fir.ref) { - !$omp task if(buzz) final(buzz) untied mergeable priority(z) allocate(omp_high_bw_mem_alloc: x) private(x) firstprivate(y) + !CHECK: omp.task if(%{{.+}}) final(%{{.+}}) priority(%{{.+}}) allocate(%{{.+}} : i32 -> %{{.+}} : !fir.ref) { + !$omp task if(buzz) final(buzz) priority(z) allocate(omp_high_bw_mem_alloc: x) private(x) firstprivate(y) !CHECK: %[[X_PRIV_ALLOCA:.+]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFtask_multiple_clausesEx"} !CHECK: %[[X_PRIV:.+]]:2 = hlfir.declare %[[X_PRIV_ALLOCA]] {uniq_name = "_QFtask_multiple_clausesEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) From 3a12f355ab418baff9fd48a541ab86128960d3cb Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 6 Nov 2023 15:55:59 +0100 Subject: [PATCH 33/40] [IR] Mark FP casts constant exprs as undesirable We will no longer generate these constant expressions by default, but they can still be explicitly created. 
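As a concrete illustration, here is a minimal sketch (an assumption-laden example, not part of the change: it presumes an LLVM tree contemporary with this series, where ConstantExpr::getCast is still available) showing that explicit creation keeps working, and that for a plain ConstantFP operand the cast folds away entirely -- the property the earlier SelectionDAG patch in this series relies on:

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Type.h"
  #include <cassert>
  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    Constant *D = ConstantFP::get(Type::getDoubleTy(Ctx), 1.0);
    // Marking FPTrunc undesirable only stops the IR folder from producing
    // the constant expression by default; asking for it explicitly is
    // still legal. With a ConstantFP operand it constant-folds straight to
    // another ConstantFP, so no expression node is materialized at all.
    Constant *F = ConstantExpr::getCast(Instruction::FPTrunc, D,
                                        Type::getFloatTy(Ctx));
    assert(isa<ConstantFP>(F) && "fptrunc of an fp literal folds");
    return 0;
  }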
--- llvm/lib/IR/Constants.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index cca481181068d7..16072d2efc95bf 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -2270,14 +2270,14 @@ bool ConstantExpr::isDesirableCastOp(unsigned Opcode) { switch (Opcode) { case Instruction::ZExt: case Instruction::SExt: - return false; - case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::UIToFP: case Instruction::SIToFP: case Instruction::FPToUI: case Instruction::FPToSI: + return false; + case Instruction::Trunc: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast:
From 4a8b0ea82495488e36da36e7d7f2eb14eda16457 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Mon, 6 Nov 2023 15:08:15 +0000 Subject: [PATCH 34/40] [DebugInfo][Verifier] Verify that array types have base types present (#70803)
The base-type field of a DICompositeType is optional in the IR syntax; however, it makes no sense to have an array of an unspecified type. Such debug info would be meaningless, and the added test crashes otherwise (see #70787). Thus, add a verifier check to reject such ill-formed debug info metadata.
Test produced by Christian Ulmann.
Fixes #70787
--- llvm/lib/IR/Verifier.cpp | 4 +++ .../DebugInfo/Generic/arrays-need-types.ll | 27 +++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 llvm/test/DebugInfo/Generic/arrays-need-types.ll
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index d3db7a16dc0d60..24dbd65d9abca2 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -1287,6 +1287,10 @@ void Verifier::visitDICompositeType(const DICompositeType &N) { CheckDI(N.getTag() == dwarf::DW_TAG_array_type, "rank can only appear in array type"); } + + if (N.getTag() == dwarf::DW_TAG_array_type) { + CheckDI(N.getRawBaseType(), "array types must have a base type", &N); + } } void Verifier::visitDISubroutineType(const DISubroutineType &N) {
diff --git a/llvm/test/DebugInfo/Generic/arrays-need-types.ll b/llvm/test/DebugInfo/Generic/arrays-need-types.ll new file mode 100644 index 00000000000000..a1b7c963d38443 --- /dev/null +++ b/llvm/test/DebugInfo/Generic/arrays-need-types.ll @@ -0,0 +1,27 @@ +; RUN: opt %s -o - -S --passes=verify 2>&1 | FileCheck %s + +; CHECK: array types must have a base type +; CHECK-NEXT: !DICompositeType(tag: DW_TAG_array_type, +; CHECK-NEXT: warning: ignoring invalid debug info + +declare void @llvm.dbg.value(metadata, metadata, metadata) + +define i32 @func(ptr %0) !dbg !3 { + call void @llvm.dbg.value(metadata ptr %0, metadata !6, metadata !DIExpression()), !dbg !10 + ret i32 0 +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "file.c", directory: "/") +!3 = distinct !DISubprogram(name: "func", scope: !2, file: !2, line: 46, type: !4, scopeLine: 48, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!4 = distinct !DISubroutineType(types: !5) +!5 = !{} +!6 = !DILocalVariable(name: "op", arg: 5, scope: !3, file: !2, line: 47, type: !7) +!7 = !DICompositeType(tag: DW_TAG_array_type, size: 2624, elements: !8) +!8 = !{!9} +!9 = !DISubrange(count: 41) +!10 = !DILocation(line: 0, scope: !3)
From 68b071d9a22796d2037eee74e49bbb9fe707e19b Mon Sep 17 00:00:00 2001
From: Simon Camphausen Date: Mon, 6 Nov 2023 16:17:20 +0100 Subject: [PATCH 35/40] [mlir][emitc] Fix corner case in translation of literal ops (#71375) Fix a corner case missed in #71296 when operands generated by literals are mixed with the args attribute of a call op. Additionally remove a range check that is already handled by the CallOp verifier. --- mlir/lib/Target/Cpp/TranslateToCpp.cpp | 9 +++++---- mlir/test/Target/Cpp/literal_call_operand.mlir | 12 ++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp index 6c95eb3d20dacd..291624c5480318 100644 --- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp +++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp @@ -425,12 +425,13 @@ static LogicalResult printOperation(CppEmitter &emitter, emitc::CallOp callOp) { // Index attributes are treated specially as operand index. if (t.getType().isIndex()) { int64_t idx = t.getInt(); - if ((idx < 0) || (idx >= op.getNumOperands())) - return op.emitOpError("invalid operand index"); - if (!emitter.hasValueInScope(op.getOperand(idx))) + Value operand = op.getOperand(idx); + auto literalDef = + dyn_cast_if_present(operand.getDefiningOp()); + if (!literalDef && !emitter.hasValueInScope(operand)) return op.emitOpError("operand ") << idx << "'s value not defined in scope"; - os << emitter.getOrCreateName(op.getOperand(idx)); + os << emitter.getOrCreateName(operand); return success(); } } diff --git a/mlir/test/Target/Cpp/literal_call_operand.mlir b/mlir/test/Target/Cpp/literal_call_operand.mlir index 017b4d53c43e3d..428b66bb2519d9 100644 --- a/mlir/test/Target/Cpp/literal_call_operand.mlir +++ b/mlir/test/Target/Cpp/literal_call_operand.mlir @@ -12,3 +12,15 @@ func.func @emitc_call_operand() { // CPP-DECLTOP: void emitc_call_operand() { // CPP-DECLTOP-NEXT: float v1; // CPP-DECLTOP-NEXT: v1 = foo(M_PI); + +func.func @emitc_call_operand_arg() { + %p0 = emitc.literal "M_PI" : f32 + %1 = emitc.call "bar"(%p0) {args = [42 : i32, 0 : index]} : (f32) -> f32 + return +} +// CPP-DEFAULT: void emitc_call_operand_arg() { +// CPP-DEFAULT-NEXT: float v1 = bar(42, M_PI); + +// CPP-DECLTOP: void emitc_call_operand_arg() { +// CPP-DECLTOP-NEXT: float v1; +// CPP-DECLTOP-NEXT: v1 = bar(42, M_PI); From 1950190b61e1087219a6c5157e742517b8023b76 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 6 Nov 2023 15:19:57 +0000 Subject: [PATCH 36/40] [DebugInfo] Use StringRef::starts_with/ends_with instead of startswith/endswith. NFC. 
startswith/endswith wrap starts_with/ends_with and will eventually go away (to more closely match string_view) --- llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp | 2 +- llvm/lib/DebugInfo/DWARF/DWARFContext.cpp | 2 +- llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp | 8 ++++---- llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp | 2 +- .../DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp | 2 +- llvm/lib/DebugInfo/PDB/Native/InputFile.cpp | 2 +- llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp | 2 +- llvm/lib/DebugInfo/PDB/Native/TpiHashing.cpp | 2 +- llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp | 2 +- llvm/lib/DebugInfo/Symbolize/MarkupFilter.cpp | 4 ++-- llvm/lib/DebugInfo/Symbolize/Symbolize.cpp | 2 +- 11 files changed, 15 insertions(+), 15 deletions(-) diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp index 126725f229c545..8302cbbf231aed 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp @@ -1014,7 +1014,7 @@ std::optional llvm::StripTemplateParameters(StringRef Name) { // // We look for > at the end but if it does not contain any < then we // have something like operator>>. We check for the operator<=> case. - if (!Name.endswith(">") || Name.count("<") == 0 || Name.endswith("<=>")) + if (!Name.ends_with(">") || Name.count("<") == 0 || Name.ends_with("<=>")) return {}; // How many < until we have the start of the template parameters. diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp index 57ca11a077a486..088dffeaa2b9f6 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -2204,7 +2204,7 @@ class DWARFObjInMemory final : public DWARFObject { continue; if (!Section.relocations().empty() && Name.ends_with(".dwo") && - RelSecName.startswith(".debug")) { + RelSecName.starts_with(".debug")) { HandleWarning(createError("unexpected relocations for dwo section '" + RelSecName + "'")); } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp index 5a5ac28f182211..20242d958b6b42 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFTypePrinter.cpp @@ -8,7 +8,7 @@ void DWARFTypePrinter::appendTypeTagName(dwarf::Tag T) { StringRef TagStr = TagString(T); static constexpr StringRef Prefix = "DW_TAG_"; static constexpr StringRef Suffix = "_type"; - if (!TagStr.startswith(Prefix) || !TagStr.endswith(Suffix)) + if (!TagStr.starts_with(Prefix) || !TagStr.ends_with(Suffix)) return; OS << TagStr.substr(Prefix.size(), TagStr.size() - (Prefix.size() + Suffix.size())) @@ -181,7 +181,7 @@ DWARFTypePrinter::appendUnqualifiedNameBefore(DWARFDie D, Word = true; StringRef Name = NamePtr; static constexpr StringRef MangledPrefix = "_STN|"; - if (Name.startswith(MangledPrefix)) { + if (Name.starts_with(MangledPrefix)) { Name = Name.drop_front(MangledPrefix.size()); auto Separator = Name.find('|'); assert(Separator != StringRef::npos); @@ -191,12 +191,12 @@ DWARFTypePrinter::appendUnqualifiedNameBefore(DWARFDie D, *OriginalFullName = (BaseName + TemplateArgs).str(); Name = BaseName; } else - EndedWithTemplate = Name.endswith(">"); + EndedWithTemplate = Name.ends_with(">"); OS << Name; // This check would be insufficient for operator overloads like // "operator>>" - but for now Clang doesn't try to simplify them, so this // is OK. Add more nuanced operator overload handling here if/when needed. 
- if (Name.endswith(">")) + if (Name.ends_with(">")) break; if (!appendTemplateParameters(D)) break; diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp index d720c1e3349551..7809fd65bfc8f9 100644 --- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp +++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp @@ -156,7 +156,7 @@ getQualifiedNameIndex(DWARFDie &Die, uint64_t Language, GsymCreator &Gsym) { // Some GCC optimizations create functions with names ending with .isra. // or .part. and those names are just DW_AT_name, not DW_AT_linkage_name // If it looks like it could be the case, don't add any prefix - if (ShortName.startswith("_Z") && + if (ShortName.starts_with("_Z") && (ShortName.contains(".isra.") || ShortName.contains(".part."))) return Gsym.insertString(ShortName, /* Copy */ false); diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp index d1789fe587f3a5..1f6724988ae979 100644 --- a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp @@ -221,7 +221,7 @@ bool LVCodeViewReader::isSystemEntry(LVElement *Element, StringRef Name) const { return StringRef::npos != Name.find(String); }; auto Starts = [=](const char *Pattern) -> bool { - return Name.startswith(Pattern); + return Name.starts_with(Pattern); }; auto CheckExclude = [&]() -> bool { if (Starts("__") || Starts("_PMD") || Starts("_PMFN")) diff --git a/llvm/lib/DebugInfo/PDB/Native/InputFile.cpp b/llvm/lib/DebugInfo/PDB/Native/InputFile.cpp index cddee3e1c273f5..328d0f5ab060fe 100644 --- a/llvm/lib/DebugInfo/PDB/Native/InputFile.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/InputFile.cpp @@ -561,7 +561,7 @@ static bool isMyCode(const SymbolGroup &Group) { return true; StringRef Name = Group.name(); - if (Name.startswith("Import:")) + if (Name.starts_with("Import:")) return false; if (Name.ends_with_insensitive(".dll")) return false; diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp index fb0ca2f9cf229c..d5cac33d15193e 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp @@ -176,7 +176,7 @@ NativeSession::searchForPdb(const PdbSearchOptions &Opts) { if (!PathOrErr) return PathOrErr.takeError(); StringRef PathFromExe = PathOrErr.get(); - sys::path::Style Style = PathFromExe.startswith("/") + sys::path::Style Style = PathFromExe.starts_with("/") ? sys::path::Style::posix : sys::path::Style::windows; StringRef PdbName = sys::path::filename(PathFromExe, Style); diff --git a/llvm/lib/DebugInfo/PDB/Native/TpiHashing.cpp b/llvm/lib/DebugInfo/PDB/Native/TpiHashing.cpp index b71b2b15814419..941ce78027a213 100644 --- a/llvm/lib/DebugInfo/PDB/Native/TpiHashing.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/TpiHashing.cpp @@ -19,7 +19,7 @@ using namespace llvm::pdb; // Corresponds to `fUDTAnon`. static bool isAnonymous(StringRef Name) { return Name == "" || Name == "__unnamed" || - Name.endswith("::") || Name.endswith("::__unnamed"); + Name.ends_with("::") || Name.ends_with("::__unnamed"); } // Computes the hash for a user-defined type record. 
This could be a struct, diff --git a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp index d7b33ce1d0f062..716312f26e0bac 100644 --- a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp +++ b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp @@ -90,7 +90,7 @@ class SourceCode { size_t PosEnd = PrunedSource->find('\n', Pos); StringRef String = PrunedSource->substr( Pos, (PosEnd == StringRef::npos) ? StringRef::npos : (PosEnd - Pos)); - if (String.endswith("\r")) + if (String.ends_with("\r")) String = String.drop_back(1); OS << format_decimal(L, MaxLineNumberWidth); if (L == Line) diff --git a/llvm/lib/DebugInfo/Symbolize/MarkupFilter.cpp b/llvm/lib/DebugInfo/Symbolize/MarkupFilter.cpp index a2bc2577b70acb..f7503ef49693d8 100644 --- a/llvm/lib/DebugInfo/Symbolize/MarkupFilter.cpp +++ b/llvm/lib/DebugInfo/Symbolize/MarkupFilter.cpp @@ -552,7 +552,7 @@ std::optional MarkupFilter::parseAddr(StringRef Str) const { } if (all_of(Str, [](char C) { return C == '0'; })) return 0; - if (!Str.startswith("0x")) { + if (!Str.starts_with("0x")) { reportTypeError(Str, "address"); return std::nullopt; } @@ -741,7 +741,7 @@ uint64_t MarkupFilter::adjustAddr(uint64_t Addr, PCType Type) const { } StringRef MarkupFilter::lineEnding() const { - return Line.endswith("\r\n") ? "\r\n" : "\n"; + return Line.ends_with("\r\n") ? "\r\n" : "\n"; } bool MarkupFilter::MMap::contains(uint64_t Addr) const { diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp index 36d112a5f3fb29..15f2a6ece8b897 100644 --- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp +++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp @@ -717,7 +717,7 @@ StringRef demanglePE32ExternCFunc(StringRef SymbolName) { // Remove any ending '@' for vectorcall. bool IsVectorCall = false; - if (HasAtNumSuffix && SymbolName.endswith("@")) { + if (HasAtNumSuffix && SymbolName.ends_with("@")) { SymbolName = SymbolName.drop_back(); IsVectorCall = true; } From ab97b89d03a7fd9b47d8ed073b01edd869421822 Mon Sep 17 00:00:00 2001 From: GoodDaisy <90915921+GoodDaisy@users.noreply.github.com> Date: Mon, 6 Nov 2023 23:28:38 +0800 Subject: [PATCH 37/40] [llvm][docs]: fix typos (#71303) --- llvm/docs/CommandGuide/dsymutil.rst | 2 +- llvm/docs/CommandGuide/llvm-remarkutil.rst | 4 ++-- llvm/docs/LangRef.rst | 2 +- llvm/docs/LoopTerminology.rst | 2 +- llvm/docs/ProgrammersManual.rst | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/docs/CommandGuide/dsymutil.rst b/llvm/docs/CommandGuide/dsymutil.rst index df621a429bb5c3..af9d7f16b36196 100644 --- a/llvm/docs/CommandGuide/dsymutil.rst +++ b/llvm/docs/CommandGuide/dsymutil.rst @@ -34,7 +34,7 @@ OPTIONS .. option:: --build-variant-suffix - Specify the build variant suffix used to build the executabe file. + Specify the build variant suffix used to build the executable file. There can be multiple variants for the binary of a product, each built slightly differently. The most common build variants are 'debug' and 'profile'. Setting the DYLD_IMAGE_SUFFIX environment variable will diff --git a/llvm/docs/CommandGuide/llvm-remarkutil.rst b/llvm/docs/CommandGuide/llvm-remarkutil.rst index 6fd739e844c4a3..20f2b34ce00883 100644 --- a/llvm/docs/CommandGuide/llvm-remarkutil.rst +++ b/llvm/docs/CommandGuide/llvm-remarkutil.rst @@ -123,7 +123,7 @@ Summary ^^^^^^^ :program:`llvm-remarkutil count` counts `remarks ` based on specified properties. 
-By default the tool counts remarks based on how many occour in a source file or function or total for the generated remark file. +By default the tool counts remarks based on how many occur in a source file or function or total for the generated remark file. The tool also supports collecting count based on specific remark arguments. The specified arguments should have an integer value to be able to report a count. The tool contains utilities to filter the remark count based on remark name, pass name, argument value and remark type. @@ -149,7 +149,7 @@ OPTIONS * ``Total``: Report a count for the provided remark file. .. option:: --args[=arguments] - If `count-by` is set to `arg` this flag can be used to collect from specified remark arguments represented as a comma seperated string. + If `count-by` is set to `arg` this flag can be used to collect from specified remark arguments represented as a comma separated string. The arguments must have a numeral value to be able to count remarks by .. option:: --rargs[=arguments]
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 106dfab7409fa1..46a4b6d19e71b9 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -6135,7 +6135,7 @@ The current supported opcode vocabulary is limited: instruction. Because ``DW_OP_LLVM_entry_value`` is defined in terms of registers, it is - usually used in MIR, but it is also allowed in LLVM IR when targetting a + usually used in MIR, but it is also allowed in LLVM IR when targeting a :ref:`swiftasync ` argument. The operation is introduced by: - ``LiveDebugValues`` pass, which applies it to function parameters that
diff --git a/llvm/docs/LoopTerminology.rst b/llvm/docs/LoopTerminology.rst index 995d3e10a0ae35..f15f9e682d7e51 100644 --- a/llvm/docs/LoopTerminology.rst +++ b/llvm/docs/LoopTerminology.rst @@ -158,7 +158,7 @@ a dominating header. * The `FixIrreducible `_ pass can transform irreducible control flow into loops by inserting - new loop headers. It is not inlcuded in any default optimization pass + new loop headers. It is not included in any default optimization pass pipeline, but is required for some back-end targets.
diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst index 5bc71bea77cdfc..551a23c0570e96 100644 --- a/llvm/docs/ProgrammersManual.rst +++ b/llvm/docs/ProgrammersManual.rst @@ -3561,8 +3561,8 @@ Important Public Members of the ``Module`` class * | ``Module::global_iterator`` - Typedef for global variable list iterator | ``Module::const_global_iterator`` - Typedef for const_iterator. | ``Module::insertGlobalVariable()`` - Inserts a global variable to the list. - | ``Module::removeGlobalVariable()`` - Removes a global variable frome the list. - | ``Module::eraseGlobalVariable()`` - Removes a global variable frome the list and deletes it. + | ``Module::removeGlobalVariable()`` - Removes a global variable from the list. + | ``Module::eraseGlobalVariable()`` - Removes a global variable from the list and deletes it. | ``global_begin()``, ``global_end()``, ``global_size()``, ``global_empty()`` These are forwarding methods that make it easy to access the contents of a
From ac254fc055980219b30821c3717c6b7db0fbbc46 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 18 Apr 2023 04:16:58 -0700 Subject: [PATCH 38/40] [SLP]Improve tryToGatherExtractElements by using per-register analysis.
Currently the tryToGatherExtractElements function analyzes the whole vector, regardless of the number of actual registers used for that vector. This may prevent some optimizations, because a per-register analysis can simplify the final code by reusing more of the already emitted vectors and by forming better shuffles.
Differential Revision: https://reviews.llvm.org/D148855
--- .../Transforms/Vectorize/SLPVectorizer.cpp | 542 ++++++++++-------- .../AArch64/extractelements-to-shuffle.ll | 135 ++--- .../SLPVectorizer/X86/crash_clear_undefs.ll | 2 +- .../SLPVectorizer/X86/hadd-inseltpoison.ll | 152 ++++- .../test/Transforms/SLPVectorizer/X86/hadd.ll | 152 ++++- .../SLPVectorizer/X86/hsub-inseltpoison.ll | 153 ++++- .../test/Transforms/SLPVectorizer/X86/hsub.ll | 153 ++++- .../X86/reused-extractelements.ll | 23 +- 8 files changed, 886 insertions(+), 426 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index d9f016c98aeb17..bb233ed7d6c77c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -662,6 +662,36 @@ tryToGatherSingleRegisterExtractElements(MutableArrayRef VL, return Res; } +/// Tries to find extractelement instructions with constant indices from fixed +/// vector type and gather such instructions into a bunch, which is highly +/// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt +/// was successful, the matched scalars are replaced by poison values in \p VL +/// for future analysis. +static SmallVector> +tryToGatherExtractElements(SmallVectorImpl &VL, + SmallVectorImpl &Mask, unsigned NumParts) { + assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1."); + SmallVector> ShufflesRes(NumParts); + Mask.assign(VL.size(), PoisonMaskElem); + unsigned SliceSize = VL.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + // Scan the list of gathered scalars for extractelements that can be + // represented as shuffles. + MutableArrayRef SubVL = + MutableArrayRef(VL).slice(Part * SliceSize, SliceSize); + SmallVector SubMask; + std::optional Res = + tryToGatherSingleRegisterExtractElements(SubVL, SubMask); + ShufflesRes[Part] = Res; + copy(SubMask, std::next(Mask.begin(), Part * SliceSize)); + } + if (none_of(ShufflesRes, [](const std::optional &Res) { + return Res.has_value(); + })) + ShufflesRes.clear(); + return ShufflesRes; +} + namespace { /// Main data required for vectorization of instructions. @@ -7152,101 +7182,80 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { : R.getGatherCost(Gathers, !Root && VL.equals(Gathers))); }; - /// Compute the cost of creating a vector of type \p VecTy containing the /// extracted values from \p VL.
- InstructionCost computeExtractCost(ArrayRef VL, ArrayRef Mask, - TTI::ShuffleKind ShuffleKind) { - unsigned NumElts = 0; - for (Value *V : VL) { - auto *EE = dyn_cast(V); - if (!EE) - continue; - auto *VecTy = cast(EE->getVectorOperandType()); - NumElts = std::max(NumElts, VecTy->getNumElements()); - } - assert(NumElts > 0 && - "Expected at least 1-element fixed length vector(s)."); - auto *VecTy = FixedVectorType::get(VL.front()->getType(), NumElts); - unsigned NumOfParts = TTI.getNumberOfParts(VecTy); - if (!NumOfParts || NumElts < NumOfParts) - return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); - unsigned EltsPerVector = PowerOf2Ceil(divideCeil(NumElts, NumOfParts)); - int ValNum = -1; - int ValIdx = -1; - // Check that if trying to permute 2 input vectors (which may result in - // several vector registers), each per-register subvector is the result of - // the permutation of 2 single registers. - if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc && - !all_of(enumerate(Mask), [&](auto &&Arg) { - if (Arg.value() == PoisonMaskElem) - return true; - int CurValNum = (Arg.value() % NumElts) / EltsPerVector; - int CurValIdx = Arg.index() / EltsPerVector; - if (ValIdx != CurValIdx) { - ValIdx = CurValIdx; - ValNum = CurValNum; - return true; - } - return CurValNum == ValNum; - })) - return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); - + /// Compute the cost of creating a vector containing the extracted values from + /// \p VL. + InstructionCost + computeExtractCost(ArrayRef VL, ArrayRef Mask, + ArrayRef> ShuffleKinds, + unsigned NumParts) { + assert(VL.size() > NumParts && "Unexpected scalarized shuffle."); + unsigned NumElts = + std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) { + auto *EE = dyn_cast(V); + if (!EE) + return Sz; + auto *VecTy = cast(EE->getVectorOperandType()); + return std::max(Sz, VecTy->getNumElements()); + }); + unsigned NumSrcRegs = TTI.getNumberOfParts( + FixedVectorType::get(VL.front()->getType(), NumElts)); + if (NumSrcRegs == 0) + NumSrcRegs = 1; + // FIXME: this must be moved to TTI for better estimation. + unsigned EltsPerVector = PowerOf2Ceil(std::max( + divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs))); + auto CheckPerRegistersShuffle = + [&](MutableArrayRef Mask) -> std::optional { + DenseSet RegIndices; + // Check that if trying to permute same single/2 input vectors. + TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc; + int FirstRegId = -1; + for (int &I : Mask) { + if (I == PoisonMaskElem) + continue; + int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector; + if (FirstRegId < 0) + FirstRegId = RegId; + RegIndices.insert(RegId); + if (RegIndices.size() > 2) + return std::nullopt; + if (RegIndices.size() == 2) + ShuffleKind = TTI::SK_PermuteTwoSrc; + I = (I % NumElts) % EltsPerVector + + (RegId == FirstRegId ? 0 : EltsPerVector); + } + return ShuffleKind; + }; InstructionCost Cost = 0; // Process extracts in blocks of EltsPerVector to check if the source vector // operand can be re-used directly. If not, add the cost of creating a // shuffle to extract the values into a vector register. - auto *RegisterVecTy = - FixedVectorType::get(VL.front()->getType(), EltsPerVector); - SmallVector RegMask(EltsPerVector, PoisonMaskElem); - TTI::ShuffleKind RegisterSK = TargetTransformInfo::SK_PermuteSingleSrc; - Value *VecBase = nullptr; - bool IsIdentity = true; - for (auto [Idx, V] : enumerate(VL)) { - // Reached the start of a new vector registers. 
- if (Idx % EltsPerVector == 0) { - RegMask.assign(EltsPerVector, PoisonMaskElem); - RegisterSK = TargetTransformInfo::SK_PermuteSingleSrc; - VecBase = nullptr; - } - - // Need to exclude undefs from analysis. - if (isa(V) || Mask[Idx] == PoisonMaskElem) + for (unsigned Part = 0; Part < NumParts; ++Part) { + if (!ShuffleKinds[Part]) continue; - - // Check all extracts for a vector register on the target directly - // extract values in order. - unsigned CurrentIdx = *getExtractIndex(cast(V)); - unsigned PrevIdx = CurrentIdx; - if (Idx % EltsPerVector != 0 && !isa(VL[Idx - 1]) && - Mask[Idx - 1] != PoisonMaskElem) - PrevIdx = *getExtractIndex(cast(VL[Idx - 1])) + 1; - if (!VecBase) { - VecBase = cast(V)->getVectorOperand(); - RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; - IsIdentity = CurrentIdx % EltsPerVector == Idx % EltsPerVector; - } else if (VecBase != cast(V)->getVectorOperand()) { - IsIdentity = false; - RegisterSK = TargetTransformInfo::SK_PermuteTwoSrc; - RegMask[Idx % EltsPerVector] = - CurrentIdx % EltsPerVector + EltsPerVector; - } else { - IsIdentity &= PrevIdx == CurrentIdx && - CurrentIdx % EltsPerVector == Idx % EltsPerVector; - RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; - } - - if (IsIdentity) + ArrayRef MaskSlice = + Mask.slice(Part * EltsPerVector, + (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0) + ? Mask.size() % EltsPerVector + : EltsPerVector); + SmallVector SubMask(EltsPerVector, PoisonMaskElem); + copy(MaskSlice, SubMask.begin()); + std::optional RegShuffleKind = + CheckPerRegistersShuffle(SubMask); + if (!RegShuffleKind) { + Cost += TTI.getShuffleCost( + *ShuffleKinds[Part], + FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice); continue; - - // Skip all indices, except for the last index per vector block. - if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size()) - continue; - - // If we have a series of extracts which are not consecutive and hence - // cannot re-use the source vector register directly, compute the shuffle - // cost to extract the vector with EltsPerVector elements. - Cost += TTI.getShuffleCost(RegisterSK, RegisterVecTy, RegMask); + } + if (*RegShuffleKind != TTI::SK_PermuteSingleSrc || + !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) { + Cost += TTI.getShuffleCost( + *RegShuffleKind, + FixedVectorType::get(VL.front()->getType(), EltsPerVector), + SubMask); + } } return Cost; } @@ -7464,90 +7473,76 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { SmallPtrSetImpl &CheckedExtracts) : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R), CheckedExtracts(CheckedExtracts) {} - Value *adjustExtracts(const TreeEntry *E, ArrayRef Mask, - TTI::ShuffleKind ShuffleKind) { + Value *adjustExtracts(const TreeEntry *E, MutableArrayRef Mask, + ArrayRef> ShuffleKinds, + unsigned NumParts) { if (Mask.empty()) return nullptr; Value *VecBase = nullptr; ArrayRef VL = E->Scalars; - auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); // If the resulting type is scalarized, do not adjust the cost. - unsigned VecNumParts = TTI.getNumberOfParts(VecTy); - if (VecNumParts == VecTy->getNumElements()) + if (NumParts == VL.size()) return nullptr; - DenseMap ExtractVectorsTys; - for (auto [I, V] : enumerate(VL)) { - // Ignore non-extractelement scalars. 
- if (isa(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem)) - continue; - // If all users of instruction are going to be vectorized and this - // instruction itself is not going to be vectorized, consider this - // instruction as dead and remove its cost from the final cost of the - // vectorized tree. - // Also, avoid adjusting the cost for extractelements with multiple uses - // in different graph entries. - const TreeEntry *VE = R.getTreeEntry(V); - if (!CheckedExtracts.insert(V).second || - !R.areAllUsersVectorized(cast(V), &VectorizedVals) || - (VE && VE != E)) - continue; - auto *EE = cast(V); - VecBase = EE->getVectorOperand(); - std::optional EEIdx = getExtractIndex(EE); - if (!EEIdx) - continue; - unsigned Idx = *EEIdx; - if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) { - auto It = - ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; - It->getSecond() = std::min(It->second, Idx); - } - // Take credit for instruction that will become dead. - if (EE->hasOneUse()) { - Instruction *Ext = EE->user_back(); - if (isa(Ext) && all_of(Ext->users(), [](User *U) { - return isa(U); - })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), - EE->getVectorOperandType(), Idx); - // Add back the cost of s|zext which is subtracted separately. - Cost += TTI.getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EE->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); + // Check if it can be considered reused if same extractelements were + // vectorized already. + bool PrevNodeFound = any_of( + ArrayRef(R.VectorizableTree).take_front(E->Idx), + [&](const std::unique_ptr &TE) { + return ((!TE->isAltShuffle() && + TE->getOpcode() == Instruction::ExtractElement) || + TE->State == TreeEntry::NeedToGather) && + all_of(enumerate(TE->Scalars), [&](auto &&Data) { + return VL.size() > Data.index() && + (Mask[Data.index()] == PoisonMaskElem || + isa(VL[Data.index()]) || + Data.value() == VL[Data.index()]); + }); + }); + unsigned SliceSize = VL.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef SubMask = Mask.slice(Part * SliceSize, SliceSize); + for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) { + // Ignore non-extractelement scalars. + if (isa(V) || + (!SubMask.empty() && SubMask[I] == PoisonMaskElem)) continue; - } - } - Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, - Idx); - } - // Add a cost for subvector extracts/inserts if required. - for (const auto &Data : ExtractVectorsTys) { - auto *EEVTy = cast(Data.first->getType()); - unsigned NumElts = VecTy->getNumElements(); - if (Data.second % NumElts == 0) - continue; - if (TTI.getNumberOfParts(EEVTy) > VecNumParts) { - unsigned Idx = (Data.second / NumElts) * NumElts; - unsigned EENumElts = EEVTy->getNumElements(); - if (Idx % NumElts == 0) + // If all users of instruction are going to be vectorized and this + // instruction itself is not going to be vectorized, consider this + // instruction as dead and remove its cost from the final cost of the + // vectorized tree. + // Also, avoid adjusting the cost for extractelements with multiple uses + // in different graph entries. 
+ const TreeEntry *VE = R.getTreeEntry(V); + if (!CheckedExtracts.insert(V).second || + !R.areAllUsersVectorized(cast(V), &VectorizedVals) || + (VE && VE != E)) continue; - if (Idx + NumElts <= EENumElts) { - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, VecTy); - } else { - // Need to round up the subvector type vectorization factor to avoid a - // crash in cost model functions. Make SubVT so that Idx + VF of SubVT - // <= EENumElts. - auto *SubVT = - FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, SubVT); + auto *EE = cast(V); + VecBase = EE->getVectorOperand(); + std::optional EEIdx = getExtractIndex(EE); + if (!EEIdx) + continue; + unsigned Idx = *EEIdx; + // Take credit for instruction that will become dead. + if (EE->hasOneUse() || !PrevNodeFound) { + Instruction *Ext = EE->user_back(); + if (isa(Ext) && all_of(Ext->users(), [](User *U) { + return isa(U); + })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + Cost -= + TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), + EE->getVectorOperandType(), Idx); + // Add back the cost of s|zext which is subtracted separately. + Cost += TTI.getCastInstrCost( + Ext->getOpcode(), Ext->getType(), EE->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); + continue; + } } - } else { - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, - VecTy, std::nullopt, CostKind, 0, EEVTy); + Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), + CostKind, Idx); } } // Check that gather of extractelements can be represented as just a @@ -7555,7 +7550,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { // Found the bunch of extractelement instructions that must be gathered // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. - Cost += computeExtractCost(VL, Mask, ShuffleKind); + // Done for reused if same extractelements were vectorized already. + if (!PrevNodeFound) + Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts); InVectors.assign(1, E); CommonMask.assign(Mask.begin(), Mask.end()); transformMaskAfterShuffle(CommonMask, CommonMask); @@ -7677,7 +7674,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { assert((IsFinalized || CommonMask.empty()) && "Shuffle construction must be finalized."); } -}; + }; InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, @@ -7738,40 +7735,41 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, reorderScalars(GatheredScalars, ReorderMask); SmallVector Mask; SmallVector ExtractMask; - std::optional ExtractShuffle; SmallVector> GatherShuffles; SmallVector> Entries; + SmallVector> ExtractShuffles; // Check for gathered extracts. 
- ExtractShuffle = - tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask); - bool Resized = false; unsigned NumParts = TTI->getNumberOfParts(VecTy); if (NumParts == 0 || NumParts >= GatheredScalars.size()) NumParts = 1; - if (Value *VecBase = Estimator.adjustExtracts( - E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) { - if (auto *VecBaseTy = dyn_cast(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); + if (!all_of(GatheredScalars, UndefValue::classof)) { + ExtractShuffles = + tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); + if (!ExtractShuffles.empty()) { + if (Value *VecBase = Estimator.adjustExtracts( + E, ExtractMask, ExtractShuffles, NumParts)) { + if (auto *VecBaseTy = dyn_cast(VecBase->getType())) + if (VF == VecBaseTy->getNumElements() && + GatheredScalars.size() != VF) { + Resized = true; + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); + } } - } else if (ExtractShuffle && - TTI->getNumberOfParts(VecTy) == VecTy->getNumElements()) { - copy(VL, GatheredScalars.begin()); - } + } - // Do not try to look for reshuffled loads for gathered loads (they will be - // handled later), for vectorized scalars, and cases, which are definitely - // not profitable (splats and small gather nodes.) - if (ExtractShuffle || E->getOpcode() != Instruction::Load || - E->isAltShuffle() || - all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || - isSplat(E->Scalars) || - (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) - GatherShuffles = - isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); + // Do not try to look for reshuffled loads for gathered loads (they will + // be handled later), for vectorized scalars, and cases, which are + // definitely not profitable (splats and small gather nodes.) + if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || + E->isAltShuffle() || + all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || + isSplat(E->Scalars) || + (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) + GatherShuffles = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); + } if (!GatherShuffles.empty()) { if (GatherShuffles.size() == 1 && *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && @@ -10013,7 +10011,10 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { : Builder(Builder), R(R) {} /// Adjusts extractelements after reusing them. - Value *adjustExtracts(const TreeEntry *E, ArrayRef Mask) { + Value *adjustExtracts(const TreeEntry *E, MutableArrayRef Mask, + unsigned NumParts, bool &UseVecBaseAsInput) { + UseVecBaseAsInput = false; + SmallPtrSet UniqueBases; Value *VecBase = nullptr; for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { int Idx = Mask[I]; @@ -10021,6 +10022,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { continue; auto *EI = cast(E->Scalars[I]); VecBase = EI->getVectorOperand(); + UniqueBases.insert(VecBase); // If the only one use is vectorized - can delete the extractelement // itself. 
if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) { @@ -10029,7 +10031,82 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { continue; R.eraseInstruction(EI); } - return VecBase; + if (NumParts == 1 || UniqueBases.size() == 1) + return VecBase; + UseVecBaseAsInput = true; + auto TransformToIdentity = [](MutableArrayRef Mask) { + for (auto [I, Idx] : enumerate(Mask)) + if (Idx != PoisonMaskElem) + Idx = I; + }; + // Perform multi-register vector shuffle, joining them into a single virtual + // long vector. + // Need to shuffle each part independently and then insert all this parts + // into a long virtual vector register, forming the original vector. + Value *Vec = nullptr; + SmallVector VecMask(Mask.size(), PoisonMaskElem); + unsigned SliceSize = E->Scalars.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef VL = + ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize); + MutableArrayRef SubMask = Mask.slice(Part * SliceSize, SliceSize); + constexpr int MaxBases = 2; + SmallVector Bases(MaxBases); +#ifndef NDEBUG + int PrevSize = 0; +#endif // NDEBUG + for (const auto [I, V]: enumerate(VL)) { + if (SubMask[I] == PoisonMaskElem) + continue; + Value *VecOp = cast(V)->getVectorOperand(); + const int Size = + cast(VecOp->getType())->getNumElements(); +#ifndef NDEBUG + assert((PrevSize == Size || PrevSize == 0) && + "Expected vectors of the same size."); + PrevSize = Size; +#endif // NDEBUG + Bases[SubMask[I] < Size ? 0 : 1] = VecOp; + } + if (!Bases.front()) + continue; + Value *SubVec; + if (Bases.back()) { + SubVec = createShuffle(Bases.front(), Bases.back(), SubMask); + TransformToIdentity(SubMask); + } else { + SubVec = Bases.front(); + } + if (!Vec) { + Vec = SubVec; + assert((Part == 0 || all_of(seq(0, Part), + [&](unsigned P) { + ArrayRef SubMask = + Mask.slice(P * SliceSize, SliceSize); + return all_of(SubMask, [](int Idx) { + return Idx == PoisonMaskElem; + }); + })) && + "Expected first part or all previous parts masked."); + copy(SubMask, VecMask.begin()); + } else { + unsigned VF = cast(Vec->getType())->getNumElements(); + if (Vec->getType() != SubVec->getType()) { + unsigned SubVecVF = + cast(SubVec->getType())->getNumElements(); + VF = std::max(VF, SubVecVF); + } + // Adjust SubMask. + for (auto [I, Idx] : enumerate(SubMask)) + if (Idx != PoisonMaskElem) + Idx += VF; + copy(SubMask, std::next(VecMask.begin(), Part * SliceSize)); + Vec = createShuffle(Vec, SubVec, VecMask); + TransformToIdentity(VecMask); + } + } + copy(VecMask, Mask.begin()); + return Vec; } /// Checks if the specified entry \p E needs to be delayed because of its /// dependency nodes. 
@@ -10372,29 +10449,37 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { BVTy ShuffleBuilder(Params...); ResTy Res = ResTy(); SmallVector Mask; - SmallVector ExtractMask; - std::optional ExtractShuffle; + SmallVector ExtractMask(GatheredScalars.size(), PoisonMaskElem); + SmallVector> ExtractShuffles; + Value *ExtractVecBase = nullptr; + bool UseVecBaseAsInput = false; SmallVector> GatherShuffles; SmallVector> Entries; Type *ScalarTy = GatheredScalars.front()->getType(); - unsigned NumParts = TTI->getNumberOfParts( - FixedVectorType::get(ScalarTy, GatheredScalars.size())); + auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size()); + unsigned NumParts = TTI->getNumberOfParts(VecTy); if (NumParts == 0 || NumParts >= GatheredScalars.size()) NumParts = 1; if (!all_of(GatheredScalars, UndefValue::classof)) { // Check for gathered extracts. - ExtractShuffle = - tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask); bool Resized = false; - if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask)) - if (auto *VecBaseTy = dyn_cast(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } + ExtractShuffles = + tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); + if (!ExtractShuffles.empty()) { + if (Value *VecBase = ShuffleBuilder.adjustExtracts( + E, ExtractMask, NumParts, UseVecBaseAsInput)) { + ExtractVecBase = VecBase; + if (auto *VecBaseTy = dyn_cast(VecBase->getType())) + if (VF == VecBaseTy->getNumElements() && + GatheredScalars.size() != VF) { + Resized = true; + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); + } + } + } // Gather extracts after we check for full matched gathers only. - if (ExtractShuffle || E->getOpcode() != Instruction::Load || + if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || E->isAltShuffle() || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || @@ -10545,30 +10630,35 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { } } }; - if (ExtractShuffle || !GatherShuffles.empty()) { + if (!ExtractShuffles.empty() || !GatherShuffles.empty()) { bool IsNonPoisoned = true; bool IsUsedInExpr = true; Value *Vec1 = nullptr; - if (ExtractShuffle) { + if (!ExtractShuffles.empty()) { // Gather of extractelements can be represented as just a shuffle of // a single/two vectors the scalars are extracted from. // Find input vectors. 
Value *Vec2 = nullptr; for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { - if (ExtractMask[I] == PoisonMaskElem || - (!Mask.empty() && Mask[I] != PoisonMaskElem)) { + if (!Mask.empty() && Mask[I] != PoisonMaskElem) ExtractMask[I] = PoisonMaskElem; - continue; - } - if (isa(E->Scalars[I])) - continue; - auto *EI = cast(E->Scalars[I]); - if (!Vec1) { - Vec1 = EI->getVectorOperand(); - } else if (Vec1 != EI->getVectorOperand()) { - assert((!Vec2 || Vec2 == EI->getVectorOperand()) && - "Expected only 1 or 2 vectors shuffle."); - Vec2 = EI->getVectorOperand(); + } + if (UseVecBaseAsInput) { + Vec1 = ExtractVecBase; + } else { + for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { + if (ExtractMask[I] == PoisonMaskElem) + continue; + if (isa(E->Scalars[I])) + continue; + auto *EI = cast(E->Scalars[I]); + if (!Vec1) { + Vec1 = EI->getVectorOperand(); + } else if (Vec1 != EI->getVectorOperand()) { + assert((!Vec2 || Vec2 == EI->getVectorOperand()) && + "Expected only 1 or 2 vectors shuffle."); + Vec2 = EI->getVectorOperand(); + } } } if (Vec2) { @@ -10629,10 +10719,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { int MSz = Mask.size(); // Try to build constant vector and shuffle with it only if currently we // have a single permutation and more than 1 scalar constants. - bool IsSingleShuffle = !ExtractShuffle || GatherShuffles.empty(); + bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty(); bool IsIdentityShuffle = - (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) == - TTI::SK_PermuteSingleSrc && + ((UseVecBaseAsInput || + all_of(ExtractShuffles, + [](const std::optional &SK) { + return SK.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc; + })) && none_of(ExtractMask, [&](int I) { return I >= EMSz; }) && ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) || (!GatherShuffles.empty() && diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll index e60e356e5cd819..8f76b2e54e6c2d 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll @@ -75,64 +75,47 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef ; CHECK-NEXT: [[TMP4TT_0_LCSSA:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_END_LOOPEXIT]] ] ; CHECK-NEXT: [[PB_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PB]], [[ENTRY]] ], [ [[SCEVGEP311]], [[WHILE_END_LOOPEXIT]] ] ; CHECK-NEXT: [[PA_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PA]], [[ENTRY]] ], [ [[SCEVGEP]], [[WHILE_END_LOOPEXIT]] ] -; CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP4TT_0_LCSSA]], i64 0 -; CHECK-NEXT: [[VGETQ_LANE45:%.*]] = extractelement <2 x i64> [[TMP4TT_0_LCSSA]], i64 1 -; CHECK-NEXT: [[ADD:%.*]] = add i64 [[VGETQ_LANE]], [[VGETQ_LANE45]] -; CHECK-NEXT: [[CONV48:%.*]] = trunc i64 [[ADD]] to i32 -; CHECK-NEXT: [[VGETQ_LANE51:%.*]] = extractelement <2 x i64> [[TMP4FF_0_LCSSA]], i64 0 -; CHECK-NEXT: [[VGETQ_LANE55:%.*]] = extractelement <2 x i64> [[TMP4FF_0_LCSSA]], i64 1 -; CHECK-NEXT: [[ADD57:%.*]] = add i64 [[VGETQ_LANE51]], [[VGETQ_LANE55]] -; CHECK-NEXT: [[CONV60:%.*]] = trunc i64 [[ADD57]] to i32 -; CHECK-NEXT: [[VGETQ_LANE63:%.*]] = extractelement <2 x i64> [[TMP4TF_0_LCSSA]], i64 0 -; CHECK-NEXT: [[VGETQ_LANE67:%.*]] = extractelement <2 x i64> [[TMP4TF_0_LCSSA]], i64 1 -; CHECK-NEXT: [[ADD69:%.*]] = add i64 [[VGETQ_LANE63]], 
[[VGETQ_LANE67]] -; CHECK-NEXT: [[CONV72:%.*]] = trunc i64 [[ADD69]] to i32 -; CHECK-NEXT: [[VGETQ_LANE75:%.*]] = extractelement <2 x i64> [[TMP4FT_0_LCSSA]], i64 0 -; CHECK-NEXT: [[VGETQ_LANE79:%.*]] = extractelement <2 x i64> [[TMP4FT_0_LCSSA]], i64 1 -; CHECK-NEXT: [[ADD81:%.*]] = add i64 [[VGETQ_LANE75]], [[VGETQ_LANE79]] -; CHECK-NEXT: [[CONV84:%.*]] = trunc i64 [[ADD81]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i64> [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i32> ; CHECK-NEXT: [[AND:%.*]] = and i32 [[NUMBEROFBOOLS]], 127 ; CHECK-NEXT: [[CMP86284:%.*]] = icmp ugt i32 [[AND]], 31 ; CHECK-NEXT: br i1 [[CMP86284]], label [[WHILE_BODY88:%.*]], label [[WHILE_END122:%.*]] ; CHECK: while.body88: ; CHECK-NEXT: [[PA_ADDR_1291:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_END121:%.*]] ], [ [[PA_ADDR_0_LCSSA]], [[WHILE_END]] ] ; CHECK-NEXT: [[PB_ADDR_1290:%.*]] = phi ptr [ [[INCDEC_PTR89:%.*]], [[WHILE_END121]] ], [ [[PB_ADDR_0_LCSSA]], [[WHILE_END]] ] -; CHECK-NEXT: [[_CTT_0289:%.*]] = phi i32 [ [[ADD99:%.*]], [[WHILE_END121]] ], [ [[CONV48]], [[WHILE_END]] ] -; CHECK-NEXT: [[_CFF_0288:%.*]] = phi i32 [ [[ADD106:%.*]], [[WHILE_END121]] ], [ [[CONV60]], [[WHILE_END]] ] -; CHECK-NEXT: [[_CTF_0287:%.*]] = phi i32 [ [[ADD113:%.*]], [[WHILE_END121]] ], [ [[CONV72]], [[WHILE_END]] ] -; CHECK-NEXT: [[_CFT_0286:%.*]] = phi i32 [ [[ADD120:%.*]], [[WHILE_END121]] ], [ [[CONV84]], [[WHILE_END]] ] ; CHECK-NEXT: [[NBBOOLBLOCK_1285:%.*]] = phi i32 [ [[SUB:%.*]], [[WHILE_END121]] ], [ [[AND]], [[WHILE_END]] ] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[PA_ADDR_1291]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[PB_ADDR_1290]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP34:%.*]], [[WHILE_END121]] ], [ [[TMP17]], [[WHILE_END]] ] +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[PA_ADDR_1291]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[PB_ADDR_1290]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY93:%.*]] ; CHECK: while.body93: -; CHECK-NEXT: [[_CTT_1283:%.*]] = phi i32 [ [[_CTT_0289]], [[WHILE_BODY88]] ], [ [[ADD99]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[_CFF_1282:%.*]] = phi i32 [ [[_CFF_0288]], [[WHILE_BODY88]] ], [ [[ADD106]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[_CTF_1281:%.*]] = phi i32 [ [[_CTF_0287]], [[WHILE_BODY88]] ], [ [[ADD113]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[_CFT_1280:%.*]] = phi i32 [ [[_CFT_0286]], [[WHILE_BODY88]] ], [ [[ADD120]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[A_0279:%.*]] = phi i32 [ [[TMP10]], [[WHILE_BODY88]] ], [ [[SHR96:%.*]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[B_0278:%.*]] = phi i32 [ [[TMP11]], [[WHILE_BODY88]] ], [ [[SHR97:%.*]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[A_0279:%.*]] = phi i32 [ [[TMP19]], [[WHILE_BODY88]] ], [ [[SHR96:%.*]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[B_0278:%.*]] = phi i32 [ [[TMP20]], [[WHILE_BODY88]] ], [ [[SHR97:%.*]], 
[[WHILE_BODY93]] ] ; CHECK-NEXT: [[SHIFT_0277:%.*]] = phi i32 [ 0, [[WHILE_BODY88]] ], [ [[INC:%.*]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i32> [ [[TMP18]], [[WHILE_BODY88]] ], [ [[TMP34]], [[WHILE_BODY93]] ] ; CHECK-NEXT: [[AND94:%.*]] = and i32 [[A_0279]], 1 ; CHECK-NEXT: [[AND95:%.*]] = and i32 [[B_0278]], 1 ; CHECK-NEXT: [[SHR96]] = lshr i32 [[A_0279]], 1 ; CHECK-NEXT: [[SHR97]] = lshr i32 [[B_0278]], 1 -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[AND94]], 0 -; CHECK-NEXT: [[TOBOOL98:%.*]] = icmp ne i32 [[AND95]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL98]], i1 false -; CHECK-NEXT: [[LAND_EXT:%.*]] = zext i1 [[TMP12]] to i32 -; CHECK-NEXT: [[ADD99]] = add i32 [[_CTT_1283]], [[LAND_EXT]] -; CHECK-NEXT: [[TOBOOL100:%.*]] = icmp eq i32 [[AND94]], 0 -; CHECK-NEXT: [[TOBOOL103:%.*]] = icmp eq i32 [[AND95]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TOBOOL100]], i1 [[TOBOOL103]], i1 false -; CHECK-NEXT: [[LAND_EXT105:%.*]] = zext i1 [[TMP13]] to i32 -; CHECK-NEXT: [[ADD106]] = add i32 [[_CFF_1282]], [[LAND_EXT105]] -; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL103]], i1 false -; CHECK-NEXT: [[LAND_EXT112:%.*]] = zext i1 [[TMP14]] to i32 -; CHECK-NEXT: [[ADD113]] = add i32 [[_CTF_1281]], [[LAND_EXT112]] -; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TOBOOL100]], i1 [[TOBOOL98]], i1 false -; CHECK-NEXT: [[LAND_EXT119:%.*]] = zext i1 [[TMP15]] to i32 -; CHECK-NEXT: [[ADD120]] = add i32 [[_CFT_1280]], [[LAND_EXT119]] +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[AND94]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <2 x i32> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> poison, i32 [[AND95]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <2 x i32> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <2 x i32> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP31]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP33:%.*]] = zext <4 x i1> [[TMP32]] to <4 x i32> +; CHECK-NEXT: [[TMP34]] = add <4 x i32> [[TMP21]], [[TMP33]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[SHIFT_0277]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], 32 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[WHILE_END121]], label [[WHILE_BODY93]] @@ -144,61 +127,53 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef ; CHECK-NEXT: br i1 [[CMP86]], label [[WHILE_BODY88]], label [[WHILE_END122]] ; CHECK: while.end122: ; CHECK-NEXT: [[NBBOOLBLOCK_1_LCSSA:%.*]] = phi i32 [ [[AND]], [[WHILE_END]] ], [ [[SUB]], [[WHILE_END121]] ] -; CHECK-NEXT: [[_CFT_0_LCSSA:%.*]] = phi i32 [ [[CONV84]], [[WHILE_END]] ], [ [[ADD120]], [[WHILE_END121]] ] -; CHECK-NEXT: [[_CTF_0_LCSSA:%.*]] = phi i32 [ [[CONV72]], [[WHILE_END]] ], [ [[ADD113]], [[WHILE_END121]] ] -; CHECK-NEXT: [[_CFF_0_LCSSA:%.*]] = phi i32 [ [[CONV60]], [[WHILE_END]] ], [ [[ADD106]], [[WHILE_END121]] ] -; CHECK-NEXT: [[_CTT_0_LCSSA:%.*]] = phi i32 [ [[CONV48]], [[WHILE_END]] ], [ [[ADD99]], 
[[WHILE_END121]] ] ; CHECK-NEXT: [[PB_ADDR_1_LCSSA:%.*]] = phi ptr [ [[PB_ADDR_0_LCSSA]], [[WHILE_END]] ], [ [[INCDEC_PTR89]], [[WHILE_END121]] ] ; CHECK-NEXT: [[PA_ADDR_1_LCSSA:%.*]] = phi ptr [ [[PA_ADDR_0_LCSSA]], [[WHILE_END]] ], [ [[INCDEC_PTR]], [[WHILE_END121]] ] +; CHECK-NEXT: [[TMP35:%.*]] = phi <4 x i32> [ [[TMP17]], [[WHILE_END]] ], [ [[TMP34]], [[WHILE_END121]] ] ; CHECK-NEXT: [[CMP130_NOT299:%.*]] = icmp eq i32 [[NBBOOLBLOCK_1_LCSSA]], 0 ; CHECK-NEXT: br i1 [[CMP130_NOT299]], label [[WHILE_END166:%.*]], label [[WHILE_BODY132_PREHEADER:%.*]] ; CHECK: while.body132.preheader: -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[PB_ADDR_1_LCSSA]], align 4 +; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[PB_ADDR_1_LCSSA]], align 4 ; CHECK-NEXT: [[SUB125:%.*]] = sub nuw nsw i32 32, [[NBBOOLBLOCK_1_LCSSA]] -; CHECK-NEXT: [[SHR128:%.*]] = lshr i32 [[TMP16]], [[SUB125]] -; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[PA_ADDR_1_LCSSA]], align 4 -; CHECK-NEXT: [[SHR126:%.*]] = lshr i32 [[TMP17]], [[SUB125]] +; CHECK-NEXT: [[SHR128:%.*]] = lshr i32 [[TMP36]], [[SUB125]] +; CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[PA_ADDR_1_LCSSA]], align 4 +; CHECK-NEXT: [[SHR126:%.*]] = lshr i32 [[TMP37]], [[SUB125]] ; CHECK-NEXT: br label [[WHILE_BODY132:%.*]] ; CHECK: while.body132: -; CHECK-NEXT: [[_CTT_2306:%.*]] = phi i32 [ [[ADD142:%.*]], [[WHILE_BODY132]] ], [ [[_CTT_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] -; CHECK-NEXT: [[_CFF_2305:%.*]] = phi i32 [ [[ADD150:%.*]], [[WHILE_BODY132]] ], [ [[_CFF_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] -; CHECK-NEXT: [[_CTF_2304:%.*]] = phi i32 [ [[ADD157:%.*]], [[WHILE_BODY132]] ], [ [[_CTF_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] -; CHECK-NEXT: [[_CFT_2303:%.*]] = phi i32 [ [[ADD164:%.*]], [[WHILE_BODY132]] ], [ [[_CFT_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[NBBOOLBLOCK_2302:%.*]] = phi i32 [ [[DEC165:%.*]], [[WHILE_BODY132]] ], [ [[NBBOOLBLOCK_1_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[A_1301:%.*]] = phi i32 [ [[SHR135:%.*]], [[WHILE_BODY132]] ], [ [[SHR126]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[B_1300:%.*]] = phi i32 [ [[SHR136:%.*]], [[WHILE_BODY132]] ], [ [[SHR128]], [[WHILE_BODY132_PREHEADER]] ] +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP51:%.*]], [[WHILE_BODY132]] ], [ [[TMP35]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[AND133:%.*]] = and i32 [[A_1301]], 1 ; CHECK-NEXT: [[AND134:%.*]] = and i32 [[B_1300]], 1 ; CHECK-NEXT: [[SHR135]] = lshr i32 [[A_1301]], 1 ; CHECK-NEXT: [[SHR136]] = lshr i32 [[B_1300]], 1 -; CHECK-NEXT: [[TOBOOL137:%.*]] = icmp ne i32 [[AND133]], 0 -; CHECK-NEXT: [[TOBOOL139:%.*]] = icmp ne i32 [[AND134]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TOBOOL137]], i1 [[TOBOOL139]], i1 false -; CHECK-NEXT: [[LAND_EXT141:%.*]] = zext i1 [[TMP18]] to i32 -; CHECK-NEXT: [[ADD142]] = add i32 [[_CTT_2306]], [[LAND_EXT141]] -; CHECK-NEXT: [[TOBOOL144:%.*]] = icmp eq i32 [[AND133]], 0 -; CHECK-NEXT: [[TOBOOL147:%.*]] = icmp eq i32 [[AND134]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TOBOOL144]], i1 [[TOBOOL147]], i1 false -; CHECK-NEXT: [[LAND_EXT149:%.*]] = zext i1 [[TMP19]] to i32 -; CHECK-NEXT: [[ADD150]] = add i32 [[_CFF_2305]], [[LAND_EXT149]] -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TOBOOL137]], i1 [[TOBOOL147]], i1 false -; CHECK-NEXT: [[LAND_EXT156:%.*]] = zext i1 [[TMP20]] to i32 -; CHECK-NEXT: [[ADD157]] = add i32 [[_CTF_2304]], [[LAND_EXT156]] -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TOBOOL144]], i1 [[TOBOOL139]], i1 false -; CHECK-NEXT: [[LAND_EXT163:%.*]] = zext i1 [[TMP21]] 
to i32 -; CHECK-NEXT: [[ADD164]] = add i32 [[_CFT_2303]], [[LAND_EXT163]] +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <2 x i32> poison, i32 [[AND133]], i32 0 +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x i32> [[TMP39]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <2 x i32> [[TMP40]], zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <2 x i32> [[TMP40]], zeroinitializer +; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x i32> poison, i32 [[AND134]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <2 x i32> [[TMP45]], zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = icmp eq <2 x i32> [[TMP45]], zeroinitializer +; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP43]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i1> [[TMP49]] to <4 x i32> +; CHECK-NEXT: [[TMP51]] = add <4 x i32> [[TMP38]], [[TMP50]] ; CHECK-NEXT: [[DEC165]] = add nsw i32 [[NBBOOLBLOCK_2302]], -1 ; CHECK-NEXT: [[CMP130_NOT:%.*]] = icmp eq i32 [[DEC165]], 0 ; CHECK-NEXT: br i1 [[CMP130_NOT]], label [[WHILE_END166]], label [[WHILE_BODY132]] ; CHECK: while.end166: -; CHECK-NEXT: [[_CFT_2_LCSSA:%.*]] = phi i32 [ [[_CFT_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD164]], [[WHILE_BODY132]] ] -; CHECK-NEXT: [[_CTF_2_LCSSA:%.*]] = phi i32 [ [[_CTF_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD157]], [[WHILE_BODY132]] ] -; CHECK-NEXT: [[_CFF_2_LCSSA:%.*]] = phi i32 [ [[_CFF_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD150]], [[WHILE_BODY132]] ] -; CHECK-NEXT: [[_CTT_2_LCSSA:%.*]] = phi i32 [ [[_CTT_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD142]], [[WHILE_BODY132]] ] -; CHECK-NEXT: store i32 [[_CTT_2_LCSSA]], ptr [[CTT:%.*]], align 4 -; CHECK-NEXT: store i32 [[_CFF_2_LCSSA]], ptr [[CFF:%.*]], align 4 -; CHECK-NEXT: store i32 [[_CTF_2_LCSSA]], ptr [[CTF:%.*]], align 4 -; CHECK-NEXT: store i32 [[_CFT_2_LCSSA]], ptr [[CFT:%.*]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = phi <4 x i32> [ [[TMP35]], [[WHILE_END122]] ], [ [[TMP51]], [[WHILE_BODY132]] ] +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 2 +; CHECK-NEXT: store i32 [[TMP53]], ptr [[CTT:%.*]], align 4 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3 +; CHECK-NEXT: store i32 [[TMP54]], ptr [[CFF:%.*]], align 4 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP52]], i32 1 +; CHECK-NEXT: store i32 [[TMP55]], ptr [[CTF:%.*]], align 4 +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i32> [[TMP52]], i32 0 +; CHECK-NEXT: store i32 [[TMP56]], ptr [[CFT:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll index de99654d84eb81..c2369a6a89ec1d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll @@ -9,7 +9,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 ; YAML-NEXT: Function: foo ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-3' +; YAML-NEXT: - Cost: '-4' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '10' ; YAML-NEXT: ... 
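The dist_vec checks above exercise this machinery end to end: the four scalar lane-sums of the <2 x i64> accumulators collapse into one <4 x i64> add fed by two joined shuffles. A standalone sketch of that output shape follows; the value names and shuffle masks are plausible reconstructions for illustration only, not copied from the test:

define <4 x i32> @lane_sums(<2 x i64> %ft, <2 x i64> %tf, <2 x i64> %tt, <2 x i64> %ff) {
  ; Collect lane 0 of each accumulator into one 4-wide vector.
  %lo01 = shufflevector <2 x i64> %ft, <2 x i64> %tf, <2 x i32> <i32 0, i32 2>
  %lo23 = shufflevector <2 x i64> %tt, <2 x i64> %ff, <2 x i32> <i32 0, i32 2>
  %lo = shufflevector <2 x i64> %lo01, <2 x i64> %lo23, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Collect lane 1 of each accumulator the same way.
  %hi01 = shufflevector <2 x i64> %ft, <2 x i64> %tf, <2 x i32> <i32 1, i32 3>
  %hi23 = shufflevector <2 x i64> %tt, <2 x i64> %ff, <2 x i32> <i32 1, i32 3>
  %hi = shufflevector <2 x i64> %hi01, <2 x i64> %hi23, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; A single vector add and trunc replace the four scalar add/trunc chains.
  %sum = add <4 x i64> %lo, %hi
  %res = trunc <4 x i64> %sum to <4 x i32>
  ret <4 x i32> %res
}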
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll index 0217ddcac00468..4a9f717918a029 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll @@ -166,11 +166,31 @@ define void @test_v4f32_v2f32_store(<4 x float> %f, ptr %p){ ; define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { -; CHECK-LABEL: @test_v4f64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] +; SSE-LABEL: @test_v4f64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x double> [[TMP7]] +; +; SLM-LABEL: @test_v4f64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x double> [[TMP7]] +; +; AVX-LABEL: @test_v4f64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x double> [[TMP3]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -266,11 +286,31 @@ define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b } define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @test_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @test_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x 
float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP7]] +; +; SLM-LABEL: @test_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP7]] +; +; AVX-LABEL: @test_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -308,11 +348,31 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-LABEL: @test_v4i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; SSE-LABEL: @test_v4i64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x i64> [[TMP7]] +; +; SLM-LABEL: @test_v4i64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x i64> [[TMP7]] +; +; AVX-LABEL: @test_v4i64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -334,11 +394,31 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: @test_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = 
shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @test_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @test_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX-LABEL: @test_v8i32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -376,11 +456,31 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { } define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: @test_v16i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] +; SSE-LABEL: @test_v16i16( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: ret <16 x i16> [[TMP7]] +; +; SLM-LABEL: @test_v16i16( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] 
= shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: ret <16 x i16> [[TMP7]] +; +; AVX-LABEL: @test_v16i16( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <16 x i16> [[TMP3]] ; %a0 = extractelement <16 x i16> %a, i32 0 %a1 = extractelement <16 x i16> %a, i32 1 @@ -448,5 +548,3 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15 ret <16 x i16> %rv15 } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll index c38d116a7a323c..cac6845c43004f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll @@ -166,11 +166,31 @@ define void @test_v4f32_v2f32_store(<4 x float> %f, ptr %p){ ; define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { -; CHECK-LABEL: @test_v4f64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] +; SSE-LABEL: @test_v4f64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x double> [[TMP7]] +; +; SLM-LABEL: @test_v4f64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x double> [[TMP7]] +; +; AVX-LABEL: @test_v4f64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x double> [[TMP3]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -266,11 +286,31 @@ define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b } define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @test_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], 
<8 x float> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @test_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP7]] +; +; SLM-LABEL: @test_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP7]] +; +; AVX-LABEL: @test_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -308,11 +348,31 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-LABEL: @test_v4i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; SSE-LABEL: @test_v4i64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x i64> [[TMP7]] +; +; SLM-LABEL: @test_v4i64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; 
SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x i64> [[TMP7]] +; +; AVX-LABEL: @test_v4i64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -334,11 +394,31 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: @test_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @test_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @test_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX-LABEL: @test_v8i32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -376,11 +456,31 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { } define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: @test_v16i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] +; SSE-LABEL: @test_v16i16( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] +; SSE-NEXT: 
[[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: ret <16 x i16> [[TMP7]] +; +; SLM-LABEL: @test_v16i16( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: ret <16 x i16> [[TMP7]] +; +; AVX-LABEL: @test_v16i16( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <16 x i16> [[TMP3]] ; %a0 = extractelement <16 x i16> %a, i32 0 %a1 = extractelement <16 x i16> %a, i32 1 @@ -448,5 +548,3 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15 ret <16 x i16> %rv15 } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll index 39400ba4ce1e83..40b6a8c32f5d02 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll @@ -145,11 +145,31 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) { ; define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { -; CHECK-LABEL: @test_v4f64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] +; SSE-LABEL: @test_v4f64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x double> [[TMP7]] +; +; SLM-LABEL: @test_v4f64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x 
double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x double> [[TMP7]] +; +; AVX-LABEL: @test_v4f64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x double> [[TMP3]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -171,11 +191,31 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { } define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @test_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @test_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP7]] +; +; SLM-LABEL: @test_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP7]] +; +; AVX-LABEL: @test_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -213,11 +253,31 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-LABEL: @test_v4i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; SSE-LABEL: @test_v4i64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x 
i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x i64> [[TMP7]] +; +; SLM-LABEL: @test_v4i64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x i64> [[TMP7]] +; +; AVX-LABEL: @test_v4i64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -239,11 +299,31 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: @test_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @test_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @test_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX-LABEL: @test_v8i32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -281,11 +361,31 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { } define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: 
@test_v16i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] +; SSE-LABEL: @test_v16i16( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: ret <16 x i16> [[TMP7]] +; +; SLM-LABEL: @test_v16i16( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: ret <16 x i16> [[TMP7]] +; +; AVX-LABEL: @test_v16i16( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <16 x i16> [[TMP3]] ; %a0 = extractelement <16 x i16> %a, i32 0 %a1 = extractelement <16 x i16> %a, i32 1 @@ -354,9 +454,6 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ret <16 x i16> %rv15 } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; AVX: {{.*}} ; AVX1: {{.*}} ; AVX2: {{.*}} ; AVX512: {{.*}} -; SLM: {{.*}} -; SSE: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll index 6b63de83c56be2..09113323d3ab77 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll @@ -145,11 +145,31 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) { ; define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { -; CHECK-LABEL: @test_v4f64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] +; SSE-LABEL: @test_v4f64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x double> [[TMP7]] +; +; SLM-LABEL: @test_v4f64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x double> [[TMP7]] +; +; AVX-LABEL: @test_v4f64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x double> [[TMP3]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -171,11 +191,31 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { } define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @test_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @test_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fsub <4 x float> 
[[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP7]] +; +; SLM-LABEL: @test_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP7]] +; +; AVX-LABEL: @test_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -213,11 +253,31 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-LABEL: @test_v4i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; SSE-LABEL: @test_v4i64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x i64> [[TMP7]] +; +; SLM-LABEL: @test_v4i64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x i64> [[TMP7]] +; +; AVX-LABEL: @test_v4i64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -239,11 +299,31 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; 
CHECK-LABEL: @test_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @test_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @test_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX-LABEL: @test_v8i32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -281,11 +361,31 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { } define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: @test_v16i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] +; SSE-LABEL: @test_v16i16( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: ret <16 x i16> [[TMP7]] +; +; SLM-LABEL: @test_v16i16( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = 
sub <8 x i16> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: ret <16 x i16> [[TMP7]] +; +; AVX-LABEL: @test_v16i16( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <16 x i16> [[TMP3]] ; %a0 = extractelement <16 x i16> %a, i32 0 %a1 = extractelement <16 x i16> %a, i32 1 @@ -354,9 +454,6 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ret <16 x i16> %rv15 } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX: {{.*}} ; AVX1: {{.*}} ; AVX2: {{.*}} ; AVX512: {{.*}} -; SLM: {{.*}} -; SSE: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll index 35cb8c729e106f..94a1d7aa1951c4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll @@ -2,23 +2,24 @@ ; RUN: opt < %s -passes=slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 -pass-remarks-output=%t | FileCheck %s ; RUN: FileCheck --input-file=%t --check-prefix=YAML %s -; YAML: --- !Passed +; YAML: --- !Missed ; YAML-NEXT: Pass: slp-vectorizer -; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Name: NotBeneficial ; YAML-NEXT: Function: g ; YAML-NEXT: Args: -; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-1' -; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '4' +; YAML-NEXT: - String: 'List vectorization was possible but not beneficial with cost ' +; YAML-NEXT: - Cost: '0' +; YAML-NEXT: - String: ' >= ' +; YAML-NEXT: - Treshold: '0' define <2 x i32> @g(<2 x i32> %x, i32 %a, i32 %b) { ; CHECK-LABEL: @g( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A:%.*]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[B:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: ret <2 x i32> [[TMP4]] +; CHECK-NEXT: [[X1:%.*]] = extractelement <2 x i32> [[X:%.*]], i32 1 +; CHECK-NEXT: [[X1X1:%.*]] = mul i32 [[X1]], [[X1]] +; CHECK-NEXT: [[AB:%.*]] = mul i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i32> poison, i32 [[X1X1]], i32 0 +; CHECK-NEXT: [[INS2:%.*]] = insertelement <2 x i32> [[INS1]], i32 [[AB]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[INS2]] ; %x1 = extractelement <2 x i32> %x, i32 1 %x1x1 = mul i32 %x1, %x1 From 1c6102d19b445c6d17d3792dde0fc308efebc875 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 6 Nov 2023 15:28:51 +0000 Subject: [PATCH 39/40] [AMDGPU] Regenerate checks for long-branch-reserve-register.ll --- .../AMDGPU/long-branch-reserve-register.ll | 47 ++++++++++--------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll index dc7d2eed53696e..fd543b96b0cf36 100644 --- a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll +++ b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; 
RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s

; OBJ: Relocations [
@@ -54,32 +55,32 @@ bb3:
 define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 {
 ; GCN-LABEL: uniform_conditional_min_long_forward_branch:
 ; GCN: ; %bb.0: ; %bb0
-; GCN-NEXT: s_load_dword s2, s[0:1], 0xb
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_eq_u32 s2, 0
-; GCN-NEXT: s_cbranch_scc0 .LBB1_1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0xb
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_eq_u32 s2, 0
+; GCN-NEXT: s_cbranch_scc0 .LBB1_1
 ; GCN-NEXT: .LBB1_3: ; %bb0
-; GCN-NEXT: s_getpc_b64 s[8:9]
+; GCN-NEXT: s_getpc_b64 s[8:9]
 ; GCN-NEXT: .Lpost_getpc0:
-; GCN-NEXT: s_add_u32 s8, s8, (.LBB1_2-.Lpost_getpc0)&4294967295
-; GCN-NEXT: s_addc_u32 s9, s9, (.LBB1_2-.Lpost_getpc0)>>32
-; GCN-NEXT: s_setpc_b64 s[8:9]
+; GCN-NEXT: s_add_u32 s8, s8, (.LBB1_2-.Lpost_getpc0)&4294967295
+; GCN-NEXT: s_addc_u32 s9, s9, (.LBB1_2-.Lpost_getpc0)>>32
+; GCN-NEXT: s_setpc_b64 s[8:9]
 ; GCN-NEXT: .LBB1_1: ; %bb2
-; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: v_nop_e64
-; GCN-NEXT: v_nop_e64
-; GCN-NEXT: v_nop_e64
-; GCN-NEXT: v_nop_e64
-; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: .LBB1_2: ; %bb3
-; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_endpgm
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
 bb0:
   %cmp = icmp eq i32 %cnd, 0
   br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch
@@ -290,7 +291,7 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GCN-NEXT: v_nop_e64
 ; GCN-NEXT: v_nop_e64
 ; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: s_mov_b64 vcc, exec
+; GCN-NEXT: s_mov_b64 vcc, exec
 ; GCN-NEXT: s_cbranch_execnz .LBB5_5
 ; GCN-NEXT: .LBB5_9: ; %bb3
 ; GCN-NEXT: s_getpc_b64 s[4:5]

From 07f5cf1992e1388f94cd538810a58b49cb18c0b7 Mon Sep 17 00:00:00 2001
From: Jan Patrick Lehr
Date: Mon, 6 Nov 2023 16:45:03 +0100
Subject: [PATCH 40/40] [OpenMP][libomptarget] Fixes possible no-return warning (#70808)

The UNREACHABLE macro resolves to message + trap, which may still warn, so we add a call to __builtin_unreachable().
---
 openmp/libomptarget/DeviceRTL/include/Debug.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/openmp/libomptarget/DeviceRTL/include/Debug.h b/openmp/libomptarget/DeviceRTL/include/Debug.h
index bd4d40e8f24fd5..22998f44a5bea5 100644
--- a/openmp/libomptarget/DeviceRTL/include/Debug.h
+++ b/openmp/libomptarget/DeviceRTL/include/Debug.h
@@ -36,7 +36,8 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
 }
 #define UNREACHABLE(msg) \
   PRINT(msg); \
-  __builtin_trap();
+  __builtin_trap(); \
+  __builtin_unreachable();

 ///}
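
To make the warning addressed by this last patch concrete, here is a minimal, self-contained C++ sketch of the macro pattern. The PRINT stub and the fallback() function are hypothetical stand-ins invented for illustration; only the shape of UNREACHABLE mirrors the patched Debug.h.

#include <cstdio>

// Hypothetical stand-in for the DeviceRTL PRINT macro.
#define PRINT(msg) std::printf("%s", msg)

// Same shape as the patched macro: __builtin_trap() aborts execution, and
// the trailing __builtin_unreachable() tells the compiler that control can
// never flow past this point.
#define UNREACHABLE(msg)                                                       \
  PRINT(msg);                                                                  \
  __builtin_trap();                                                            \
  __builtin_unreachable();

// Without the __builtin_unreachable() call, some compilers may still emit a
// missing-return diagnostic (e.g. -Wreturn-type) for the default path below.
int fallback(int kind) {
  switch (kind) {
  case 0:
    return 42;
  default:
    UNREACHABLE("unexpected kind\n");
  }
}

int main() { return fallback(0) == 42 ? 0 : 1; }

With the trailing __builtin_unreachable(), every path out of fallback() either returns a value or is explicitly marked unreachable, so no dummy return value is needed to keep the function warning-clean.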