diff --git a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp index 277e6ec5b08900..878067aca0173f 100644 --- a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp +++ b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp @@ -228,6 +228,11 @@ class ASTWalker : public RecursiveASTVisitor { // Mark declaration from definition as it needs type-checking. if (FD->isThisDeclarationADefinition()) report(FD->getLocation(), FD); + // Explicit specialization/instantiations of a function template require + // the primary template. + if (clang::isTemplateExplicitInstantiationOrSpecialization( + FD->getTemplateSpecializationKind())) + report(FD->getLocation(), FD->getPrimaryTemplate()); return true; } bool VisitVarDecl(VarDecl *VD) { diff --git a/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp b/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp index e238dc3d902bbe..5dc88157e13af0 100644 --- a/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp +++ b/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp @@ -229,13 +229,9 @@ TEST(WalkAST, FunctionTemplates) { EXPECT_THAT(testWalk("template void foo(T) {}", "template void ^foo(int);"), ElementsAre()); - // FIXME: Report specialized template as used from explicit specializations. - EXPECT_THAT(testWalk("template void foo(T);", + EXPECT_THAT(testWalk("template void $explicit^foo(T);", "template<> void ^foo(int);"), - ElementsAre()); - EXPECT_THAT(testWalk("template void foo(T) {}", - "template void ^foo(T*) {}"), - ElementsAre()); + ElementsAre(Decl::FunctionTemplate)); // Implicit instantiations references most relevant template. EXPECT_THAT(testWalk(R"cpp( @@ -510,6 +506,8 @@ TEST(WalkAST, Functions) { // Definition uses declaration, not the other way around. 
testWalk("void $explicit^foo();", "void ^foo() {}"); testWalk("void foo() {}", "void ^foo();"); + testWalk("template void $explicit^foo();", + "template void ^foo() {}"); // Unresolved calls marks all the overloads. testWalk("void $ambiguous^foo(int); void $ambiguous^foo(char);", diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index b151f8d0d7a79c..122b9045a75f6e 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -3213,12 +3213,6 @@ bool ByteCodeExprGen::VisitDeclRefExpr(const DeclRefExpr *E) { // we haven't seen yet. if (Ctx.getLangOpts().CPlusPlus) { if (const auto *VD = dyn_cast(D)) { - // Dummy for static locals - if (VD->isStaticLocal()) { - if (std::optional I = P.getOrCreateDummy(D)) - return this->emitGetPtrGlobal(*I, E); - return false; - } // Visit local const variables like normal. if (VD->isLocalVarDecl() && VD->getType().isConstQualified()) { if (!this->visitVarDecl(VD)) @@ -3226,6 +3220,9 @@ bool ByteCodeExprGen::VisitDeclRefExpr(const DeclRefExpr *E) { // Retry. return this->VisitDeclRefExpr(E); } + + if (VD->hasExternalStorage()) + return this->emitInvalidDeclRef(E, E); } } else { if (const auto *VD = dyn_cast(D); @@ -3235,11 +3232,11 @@ bool ByteCodeExprGen::VisitDeclRefExpr(const DeclRefExpr *E) { // Retry. 
return this->VisitDeclRefExpr(E); } - - if (std::optional I = P.getOrCreateDummy(D)) - return this->emitGetPtrGlobal(*I, E); } + if (std::optional I = P.getOrCreateDummy(D)) + return this->emitGetPtrGlobal(*I, E); + return this->emitInvalidDeclRef(E, E); } diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index 5670888c245eb1..4f3cd6cd21a151 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -285,10 +285,6 @@ static bool CheckConstant(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { return CheckConstant(S, OpPC, Ptr.getDeclDesc()); } -bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { - return !Ptr.isDummy(); -} - bool CheckNull(InterpState &S, CodePtr OpPC, const Pointer &Ptr, CheckSubobjectKind CSK) { if (!Ptr.isZero()) @@ -595,10 +591,8 @@ bool CheckFloatResult(InterpState &S, CodePtr OpPC, const Floating &Result, return true; } -/// We aleady know the given DeclRefExpr is invalid for some reason, -/// now figure out why and print appropriate diagnostics. -bool CheckDeclRef(InterpState &S, CodePtr OpPC, const DeclRefExpr *DR) { - const ValueDecl *D = DR->getDecl(); +static bool diagnoseUnknownDecl(InterpState &S, CodePtr OpPC, + const ValueDecl *D) { const SourceInfo &E = S.Current->getSource(OpPC); if (isa(D)) { @@ -621,10 +615,28 @@ bool CheckDeclRef(InterpState &S, CodePtr OpPC, const DeclRefExpr *DR) { return false; } } - return false; } +/// We already know the given DeclRefExpr is invalid for some reason, +/// now figure out why and print appropriate diagnostics. 
+bool CheckDeclRef(InterpState &S, CodePtr OpPC, const DeclRefExpr *DR) { + const ValueDecl *D = DR->getDecl(); + return diagnoseUnknownDecl(S, OpPC, D); +} + +bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { + if (!Ptr.isDummy()) + return true; + + const Descriptor *Desc = Ptr.getDeclDesc(); + const ValueDecl *D = Desc->asValueDecl(); + if (!D) + return false; + + return diagnoseUnknownDecl(S, OpPC, D); +} + bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F, const CallExpr *CE, unsigned ArgSize) { auto Args = llvm::ArrayRef(CE->getArgs(), CE->getNumArgs()); diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 13e004371f912c..f379c9869d8d8e 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -572,7 +572,8 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { template ::T> bool Inc(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); - + if (Ptr.isDummy()) + return false; if (!CheckInitialized(S, OpPC, Ptr, AK_Increment)) return false; @@ -585,7 +586,8 @@ bool Inc(InterpState &S, CodePtr OpPC) { template ::T> bool IncPop(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); - + if (Ptr.isDummy()) + return false; if (!CheckInitialized(S, OpPC, Ptr, AK_Increment)) return false; @@ -599,7 +601,8 @@ bool IncPop(InterpState &S, CodePtr OpPC) { template ::T> bool Dec(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); - + if (Ptr.isDummy()) + return false; if (!CheckInitialized(S, OpPC, Ptr, AK_Decrement)) return false; @@ -612,7 +615,8 @@ bool Dec(InterpState &S, CodePtr OpPC) { template ::T> bool DecPop(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); - + if (Ptr.isDummy()) + return false; if (!CheckInitialized(S, OpPC, Ptr, AK_Decrement)) return false; @@ -641,7 +645,8 @@ bool IncDecFloatHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr, inline bool Incf(InterpState &S, CodePtr OpPC, 
llvm::RoundingMode RM) { const Pointer &Ptr = S.Stk.pop(); - + if (Ptr.isDummy()) + return false; if (!CheckInitialized(S, OpPC, Ptr, AK_Increment)) return false; @@ -650,7 +655,8 @@ inline bool Incf(InterpState &S, CodePtr OpPC, llvm::RoundingMode RM) { inline bool IncfPop(InterpState &S, CodePtr OpPC, llvm::RoundingMode RM) { const Pointer &Ptr = S.Stk.pop(); - + if (Ptr.isDummy()) + return false; if (!CheckInitialized(S, OpPC, Ptr, AK_Increment)) return false; @@ -660,6 +666,9 @@ inline bool IncfPop(InterpState &S, CodePtr OpPC, llvm::RoundingMode RM) { inline bool Decf(InterpState &S, CodePtr OpPC, llvm::RoundingMode RM) { const Pointer &Ptr = S.Stk.pop(); + if (Ptr.isDummy()) + return false; + if (!CheckInitialized(S, OpPC, Ptr, AK_Decrement)) return false; @@ -669,6 +678,8 @@ inline bool Decf(InterpState &S, CodePtr OpPC, llvm::RoundingMode RM) { inline bool DecfPop(InterpState &S, CodePtr OpPC, llvm::RoundingMode RM) { const Pointer &Ptr = S.Stk.pop(); + if (Ptr.isDummy()) + return false; if (!CheckInitialized(S, OpPC, Ptr, AK_Decrement)) return false; @@ -774,9 +785,9 @@ inline bool CmpHelperEQ(InterpState &S, CodePtr OpPC, CompareFn Fn) { // element in the same array are NOT equal. They have the same Base value, // but a different Offset. This is a pretty rare case, so we fix this here // by comparing pointers to the first elements. 
- if (LHS.isArrayRoot()) + if (!LHS.isDummy() && LHS.isArrayRoot()) VL = LHS.atIndex(0).getByteOffset(); - if (RHS.isArrayRoot()) + if (!RHS.isDummy() && RHS.isArrayRoot()) VR = RHS.atIndex(0).getByteOffset(); S.Stk.push(BoolT::from(Fn(Compare(VL, VR)))); @@ -1895,7 +1906,7 @@ inline bool ArrayElemPtr(InterpState &S, CodePtr OpPC) { const T &Offset = S.Stk.pop(); const Pointer &Ptr = S.Stk.peek(); - if (!CheckDummy(S, OpPC, Ptr)) + if (Ptr.isDummy()) return true; if (!OffsetHelper(S, OpPC, Offset, Ptr)) @@ -1909,7 +1920,7 @@ inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) { const T &Offset = S.Stk.pop(); const Pointer &Ptr = S.Stk.pop(); - if (!CheckDummy(S, OpPC, Ptr)) { + if (Ptr.isDummy()) { S.Stk.push(Ptr); return true; } diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index a4ee4c8435cac5..87cd95945669f2 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -7367,19 +7367,25 @@ void CodeGenFunction::EmitOMPInteropDirective(const OMPInteropDirective &S) { S.getSingleClause())) && "OMPNowaitClause clause is used separately in OMPInteropDirective."); - if (const auto *C = S.getSingleClause()) { - llvm::Value *InteropvarPtr = - EmitLValue(C->getInteropVar()).getPointer(*this); - llvm::omp::OMPInteropType InteropType = llvm::omp::OMPInteropType::Unknown; - if (C->getIsTarget()) { - InteropType = llvm::omp::OMPInteropType::Target; - } else { - assert(C->getIsTargetSync() && "Expected interop-type target/targetsync"); - InteropType = llvm::omp::OMPInteropType::TargetSync; + auto ItOMPInitClause = S.getClausesOfKind(); + if (!ItOMPInitClause.empty()) { + // Look at the multiple init clauses + for (const OMPInitClause *C : ItOMPInitClause) { + llvm::Value *InteropvarPtr = + EmitLValue(C->getInteropVar()).getPointer(*this); + llvm::omp::OMPInteropType InteropType = + llvm::omp::OMPInteropType::Unknown; + if (C->getIsTarget()) { + InteropType = llvm::omp::OMPInteropType::Target; + } else 
{ + assert(C->getIsTargetSync() && + "Expected interop-type target/targetsync"); + InteropType = llvm::omp::OMPInteropType::TargetSync; + } + OMPBuilder.createOMPInteropInit(Builder, InteropvarPtr, InteropType, + Device, NumDependences, DependenceList, + Data.HasNowaitClause); } - OMPBuilder.createOMPInteropInit(Builder, InteropvarPtr, InteropType, Device, - NumDependences, DependenceList, - Data.HasNowaitClause); } else if (const auto *C = S.getSingleClause()) { llvm::Value *InteropvarPtr = EmitLValue(C->getInteropVar()).getPointer(*this); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index adbd04e8b50bf5..8106a39463b941 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -4964,7 +4964,8 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, if (RC->getModifier() != OMPC_REDUCTION_inscan) continue; for (Expr *E : RC->copy_array_temps()) - MarkDeclarationsReferencedInExpr(E); + if (E) + MarkDeclarationsReferencedInExpr(E); } if (auto *AC = dyn_cast(C)) { for (Expr *E : AC->varlists()) diff --git a/clang/test/AST/Interp/arrays.cpp b/clang/test/AST/Interp/arrays.cpp index e1af2e80e3ad77..2bf6e9ef35119f 100644 --- a/clang/test/AST/Interp/arrays.cpp +++ b/clang/test/AST/Interp/arrays.cpp @@ -564,3 +564,8 @@ namespace LocalVLA { #endif } } + +char melchizedek[2200000000]; +typedef decltype(melchizedek[1] - melchizedek[0]) ptrdiff_t; +constexpr ptrdiff_t d1 = &melchizedek[0x7fffffff] - &melchizedek[0]; // ok +constexpr ptrdiff_t d3 = &melchizedek[0] - &melchizedek[0x80000000u]; // ok diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c index 2a72c24b43d1cd..260e5bdfeefb2b 100644 --- a/clang/test/AST/Interp/c.c +++ b/clang/test/AST/Interp/c.c @@ -33,15 +33,15 @@ const int b = 3; _Static_assert(b == 3, ""); // pedantic-ref-warning {{not an integer constant expression}} \ // pedantic-expected-warning {{not an integer constant expression}} -/// FIXME: The new interpreter is missing the "initializer of 
'c' unknown" diagnostics. -const int c; // ref-note {{declared here}} \ - // pedantic-ref-note {{declared here}} +const int c; // all-note {{declared here}} _Static_assert(c == 0, ""); // ref-error {{not an integral constant expression}} \ // ref-note {{initializer of 'c' is unknown}} \ // pedantic-ref-error {{not an integral constant expression}} \ // pedantic-ref-note {{initializer of 'c' is unknown}} \ // expected-error {{not an integral constant expression}} \ - // pedantic-expected-error {{not an integral constant expression}} + // expected-note {{initializer of 'c' is unknown}} \ + // pedantic-expected-error {{not an integral constant expression}} \ + // pedantic-expected-note {{initializer of 'c' is unknown}} _Static_assert(&c != 0, ""); // ref-warning {{always true}} \ // pedantic-ref-warning {{always true}} \ diff --git a/clang/test/AST/Interp/cxx98.cpp b/clang/test/AST/Interp/cxx98.cpp index 1acc74a8290a06..73e45372066334 100644 --- a/clang/test/AST/Interp/cxx98.cpp +++ b/clang/test/AST/Interp/cxx98.cpp @@ -18,12 +18,13 @@ template struct C; /// FIXME: This example does not get properly diagnosed in the new interpreter. 
extern const int recurse1; -const int recurse2 = recurse1; // ref-note {{here}} +const int recurse2 = recurse1; // both-note {{declared here}} const int recurse1 = 1; int array1[recurse1]; int array2[recurse2]; // ref-warning 2{{variable length array}} \ // ref-note {{initializer of 'recurse2' is not a constant expression}} \ // expected-warning {{variable length array}} \ + // expected-note {{read of non-const variable 'recurse2'}} \ // expected-error {{variable length array}} int NCI; // both-note {{declared here}} diff --git a/clang/test/OpenMP/interop_codegen.cpp b/clang/test/OpenMP/interop_codegen.cpp new file mode 100644 index 00000000000000..ea83ef8ed4909f --- /dev/null +++ b/clang/test/OpenMP/interop_codegen.cpp @@ -0,0 +1,35 @@ +// expected-no-diagnostics +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s + +#ifndef HEADER +#define HEADER + +typedef void *omp_interop_t; +#define omp_interop_none 0 +#define omp_ipr_fr_id -1 +typedef long omp_intptr_t; +#define NULL 0 + +extern omp_intptr_t omp_get_interop_int(const omp_interop_t, int, int *); + +int main() { + omp_interop_t obj = omp_interop_none; + omp_interop_t i1 = omp_interop_none; + omp_interop_t i2 = omp_interop_none; + omp_interop_t i3 = omp_interop_none; + omp_interop_t i4 = omp_interop_none; + omp_interop_t i5 = omp_interop_none; + + #pragma omp interop init(targetsync: i1) init(targetsync: obj) + int id = (int )omp_get_interop_int(obj, omp_ipr_fr_id, NULL); + int id1 = (int )omp_get_interop_int(i1, omp_ipr_fr_id, NULL); + + +} +#endif + +// CHECK-LABEL: define {{.+}}main{{.+}} +// CHECK: call 
{{.+}}__tgt_interop_init({{.+}}i1{{.*}}) +// CHECK: call {{.+}}__tgt_interop_init({{.+}}obj{{.*}}) diff --git a/clang/test/OpenMP/scan_ast_print.cpp b/clang/test/OpenMP/scan_ast_print.cpp index 3bbd3b60c3e8c4..82cb13eb6e70f7 100644 --- a/clang/test/OpenMP/scan_ast_print.cpp +++ b/clang/test/OpenMP/scan_ast_print.cpp @@ -17,6 +17,10 @@ T tmain(T argc) { static T a; #pragma omp for reduction(inscan, +: a) for (int i = 0; i < 10; ++i) { +#pragma omp scan inclusive(a) + } +#pragma omp parallel for reduction(inscan, +:a) + for (int i = 0; i < 10; ++i) { #pragma omp scan inclusive(a) } return a + argc; @@ -25,15 +29,29 @@ T tmain(T argc) { // CHECK-NEXT: #pragma omp for reduction(inscan, +: a) // CHECK-NEXT: for (int i = 0; i < 10; ++i) { // CHECK-NEXT: #pragma omp scan inclusive(a){{$}} + +// CHECK: #pragma omp parallel for reduction(inscan, +: a) +// CHECK-NEXT: for (int i = 0; i < 10; ++i) { +// CHECK-NEXT: #pragma omp scan inclusive(a){{$}} + // CHECK: static int a; // CHECK-NEXT: #pragma omp for reduction(inscan, +: a) // CHECK-NEXT: for (int i = 0; i < 10; ++i) { // CHECK-NEXT: #pragma omp scan inclusive(a) + +// CHECK: #pragma omp parallel for reduction(inscan, +: a) +// CHECK-NEXT: for (int i = 0; i < 10; ++i) { +// CHECK-NEXT: #pragma omp scan inclusive(a) + // CHECK: static char a; // CHECK-NEXT: #pragma omp for reduction(inscan, +: a) // CHECK-NEXT: for (int i = 0; i < 10; ++i) { // CHECK-NEXT: #pragma omp scan inclusive(a) +// CHECK: #pragma omp parallel for reduction(inscan, +: a) +// CHECK-NEXT: for (int i = 0; i < 10; ++i) { +// CHECK-NEXT: #pragma omp scan inclusive(a) + int main(int argc, char **argv) { static int a; // CHECK: static int a; diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index ea81c662044306..664279cb123949 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -74,6 +74,7 @@ // CHECK-NOT: __riscv_xventanacondops 
{{.*$}} // CHECK-NOT: __riscv_za128rs {{.*$}} // CHECK-NOT: __riscv_za64rs {{.*$}} +// CHECK-NOT: __riscv_zacas {{.*$}} // CHECK-NOT: __riscv_zawrs {{.*$}} // CHECK-NOT: __riscv_zba {{.*$}} // CHECK-NOT: __riscv_zbb {{.*$}} @@ -166,7 +167,6 @@ // CHECK-NOT: __riscv_ssqosid{{.*$}} // CHECK-NOT: __riscv_supm{{.*$}} // CHECK-NOT: __riscv_zaamo {{.*$}} -// CHECK-NOT: __riscv_zacas {{.*$}} // CHECK-NOT: __riscv_zalasr {{.*$}} // CHECK-NOT: __riscv_zalrsc {{.*$}} // CHECK-NOT: __riscv_zcmop {{.*$}} @@ -660,6 +660,14 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-ZA64RS-EXT %s // CHECK-ZA64RS-EXT: __riscv_za64rs 1000000{{$}} +// RUN: %clang --target=riscv32 \ +// RUN: -march=rv32i_zacas1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s +// RUN: %clang --target=riscv64 \ +// RUN: -march=rv64i_zacas1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s +// CHECK-ZACAS-EXT: __riscv_zacas 1000000{{$}} + // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32izawrs -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZAWRS-EXT %s @@ -1485,14 +1493,6 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-ZAAMO-EXT %s // CHECK-ZAAMO-EXT: __riscv_zaamo 2000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32i_zacas1p0 -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64i_zacas1p0 -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s -// CHECK-ZACAS-EXT: __riscv_zacas 1000000{{$}} - // RUN: %clang --target=riscv32 -menable-experimental-extensions \ // RUN: -march=rv32i_zalasr0p1 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZALASR-EXT %s diff --git a/flang/include/flang/Lower/LoweringOptions.def b/flang/include/flang/Lower/LoweringOptions.def index 503acdac869c7a..9de69ac5c80f52 100644 --- a/flang/include/flang/Lower/LoweringOptions.def +++ 
b/flang/include/flang/Lower/LoweringOptions.def @@ -24,8 +24,8 @@ LOWERINGOPT(Name, Bits, Default) /// If true, lower transpose without a runtime call. ENUM_LOWERINGOPT(OptimizeTranspose, unsigned, 1, 1) -/// If true, enable polymorphic type lowering feature. Off by default. -ENUM_LOWERINGOPT(PolymorphicTypeImpl, unsigned, 1, 0) +/// If true, enable polymorphic type lowering feature. On by default. +ENUM_LOWERINGOPT(PolymorphicTypeImpl, unsigned, 1, 1) /// If true, lower to High level FIR before lowering to FIR. On by default. ENUM_LOWERINGOPT(LowerToHighLevelFIR, unsigned, 1, 1) diff --git a/flang/test/Driver/flang-experimental-polymorphism-flag.f90 b/flang/test/Driver/flang-experimental-polymorphism-flag.f90 index 106e898149a18f..095c1cc929e67b 100644 --- a/flang/test/Driver/flang-experimental-polymorphism-flag.f90 +++ b/flang/test/Driver/flang-experimental-polymorphism-flag.f90 @@ -1,10 +1,10 @@ ! Test -flang-experimental-hlfir flag ! RUN: %flang_fc1 -flang-experimental-polymorphism -emit-fir -o - %s | FileCheck %s -! RUN: not %flang_fc1 -emit-fir -o - %s 2>&1 | FileCheck %s --check-prefix NO-POLYMORPHISM +! RUN: %flang_fc1 -emit-fir -o - %s 2>&1 | FileCheck %s --check-prefix NO-POLYMORPHISM ! CHECK: func.func @_QPtest(%{{.*}}: !fir.class {fir.bindc_name = "poly"}) subroutine test(poly) class(*) :: poly end subroutine test -! NO-POLYMORPHISM: not yet implemented: support for polymorphic types +! 
NO-POLYMORPHISM: func.func @_QPtest diff --git a/libc/cmake/modules/compiler_features/check_float128.cpp b/libc/cmake/modules/compiler_features/check_float128.cpp index 8b1e3fe04ed4e1..20f889c14f997b 100644 --- a/libc/cmake/modules/compiler_features/check_float128.cpp +++ b/libc/cmake/modules/compiler_features/check_float128.cpp @@ -1,4 +1,4 @@ -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" #ifndef LIBC_COMPILER_HAS_FLOAT128 #error unsupported diff --git a/libc/docs/dev/code_style.rst b/libc/docs/dev/code_style.rst index eeeced0359adbc..c76f8874f3aef6 100644 --- a/libc/docs/dev/code_style.rst +++ b/libc/docs/dev/code_style.rst @@ -47,7 +47,7 @@ We define two kinds of macros: e.g., ``LIBC_COMPILER_IS_CLANG``. * ``cpu_features.h`` - Target cpu feature availability. e.g., ``LIBC_TARGET_CPU_HAS_AVX2``. - * ``float.h`` - Floating point type properties and availability. + * ``types.h`` - Type properties and availability. e.g., ``LIBC_COMPILER_HAS_FLOAT128``. * ``os.h`` - Target os properties. e.g., ``LIBC_TARGET_OS_IS_LINUX``. 
diff --git a/libc/src/__support/CPP/CMakeLists.txt b/libc/src/__support/CPP/CMakeLists.txt index d747412791bd8e..6c35bc7090819e 100644 --- a/libc/src/__support/CPP/CMakeLists.txt +++ b/libc/src/__support/CPP/CMakeLists.txt @@ -153,10 +153,10 @@ add_header_library( type_traits/type_identity.h type_traits/void_t.h DEPENDS + libc.include.llvm-libc-macros.stdfix_macros libc.src.__support.macros.attributes libc.src.__support.macros.config - libc.src.__support.macros.properties.float - libc.include.llvm-libc-macros.stdfix_macros + libc.src.__support.macros.properties.types ) add_header_library( diff --git a/libc/src/__support/CPP/type_traits/is_floating_point.h b/libc/src/__support/CPP/type_traits/is_floating_point.h index 3a5260bcab11ee..7f01cc41cae8fe 100644 --- a/libc/src/__support/CPP/type_traits/is_floating_point.h +++ b/libc/src/__support/CPP/type_traits/is_floating_point.h @@ -11,7 +11,7 @@ #include "src/__support/CPP/type_traits/is_same.h" #include "src/__support/CPP/type_traits/remove_cv.h" #include "src/__support/macros/attributes.h" -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE::cpp { diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt index 0c932e8ffcd550..f1c6fba22856dd 100644 --- a/libc/src/__support/FPUtil/CMakeLists.txt +++ b/libc/src/__support/FPUtil/CMakeLists.txt @@ -33,7 +33,7 @@ add_header_library( libc.src.__support.CPP.type_traits libc.src.__support.libc_assert libc.src.__support.macros.attributes - libc.src.__support.macros.properties.float + libc.src.__support.macros.properties.types libc.src.__support.math_extras libc.src.__support.uint128 ) diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index fb5ff6a4153c90..1703e9a2bb3317 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -15,7 +15,7 @@ #include "src/__support/common.h" #include 
"src/__support/libc_assert.h" // LIBC_ASSERT #include "src/__support/macros/attributes.h" // LIBC_INLINE, LIBC_INLINE_VAR -#include "src/__support/macros/properties/float.h" // LIBC_COMPILER_HAS_FLOAT128 +#include "src/__support/macros/properties/types.h" // LIBC_COMPILER_HAS_FLOAT128 #include "src/__support/math_extras.h" // mask_trailing_ones #include diff --git a/libc/src/__support/macros/properties/CMakeLists.txt b/libc/src/__support/macros/properties/CMakeLists.txt index 3c492ab55a90cb..bbc45650f3fca3 100644 --- a/libc/src/__support/macros/properties/CMakeLists.txt +++ b/libc/src/__support/macros/properties/CMakeLists.txt @@ -25,9 +25,9 @@ add_header_library( ) add_header_library( - float + types HDRS - float.h + types.h DEPENDS .architectures .compiler diff --git a/libc/src/__support/macros/properties/float.h b/libc/src/__support/macros/properties/types.h similarity index 85% rename from libc/src/__support/macros/properties/float.h rename to libc/src/__support/macros/properties/types.h index 510f3923749358..e812a9dfcfd8ab 100644 --- a/libc/src/__support/macros/properties/float.h +++ b/libc/src/__support/macros/properties/types.h @@ -1,15 +1,14 @@ -//===-- Float type support --------------------------------------*- C++ -*-===// +//===-- Types support -------------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// Floating point properties are a combination of compiler support, target OS -// and target architecture. +// Types detection and support. 
-#ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_FLOAT_H -#define LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_FLOAT_H +#ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H +#define LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H #include "llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG #include "llvm-libc-types/float128.h" // float128 @@ -60,4 +59,4 @@ using float16 = _Float16; #define LIBC_COMPILER_HAS_FLOAT128 #endif -#endif // LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_FLOAT_H +#endif // LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H diff --git a/libc/src/math/ceilf128.h b/libc/src/math/ceilf128.h index db8feffc87ba2b..b0c4020718b29d 100644 --- a/libc/src/math/ceilf128.h +++ b/libc/src/math/ceilf128.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_CEILF128_H #define LLVM_LIBC_SRC_MATH_CEILF128_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/copysignf128.h b/libc/src/math/copysignf128.h index 0eda56a1cebbb0..06c194985d720b 100644 --- a/libc/src/math/copysignf128.h +++ b/libc/src/math/copysignf128.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_COPYSIGNF128_H #define LLVM_LIBC_SRC_MATH_COPYSIGNF128_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/fabsf128.h b/libc/src/math/fabsf128.h index 5999757decfdab..0a275025a5cfe7 100644 --- a/libc/src/math/fabsf128.h +++ b/libc/src/math/fabsf128.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_FABSF128_H #define LLVM_LIBC_SRC_MATH_FABSF128_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/fdimf128.h b/libc/src/math/fdimf128.h index c6f488a586dc03..f0485aba4822c5 100644 --- a/libc/src/math/fdimf128.h +++ b/libc/src/math/fdimf128.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_FDIMF128_H #define 
LLVM_LIBC_SRC_MATH_FDIMF128_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/floorf128.h b/libc/src/math/floorf128.h index 86b9a8e9265e11..b97c4b6c6ceced 100644 --- a/libc/src/math/floorf128.h +++ b/libc/src/math/floorf128.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_FLOORF128_H #define LLVM_LIBC_SRC_MATH_FLOORF128_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/fmaxf128.h b/libc/src/math/fmaxf128.h index 39eaaf616dd5d8..a4407d9655afa7 100644 --- a/libc/src/math/fmaxf128.h +++ b/libc/src/math/fmaxf128.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_FMAXF128_H #define LLVM_LIBC_SRC_MATH_FMAXF128_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/fminf128.h b/libc/src/math/fminf128.h index b3d1bec8e2ad92..d2ed593250a4af 100644 --- a/libc/src/math/fminf128.h +++ b/libc/src/math/fminf128.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_FMINF128_H #define LLVM_LIBC_SRC_MATH_FMINF128_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/frexpf128.h b/libc/src/math/frexpf128.h index 5d70860fa15599..55c4a47cc80ca3 100644 --- a/libc/src/math/frexpf128.h +++ b/libc/src/math/frexpf128.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_FREXPF128_H #define LLVM_LIBC_SRC_MATH_FREXPF128_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 120ada8202ab9d..82d2a5e66af781 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -43,7 +43,7 @@ 
add_entrypoint_object( COMPILE_OPTIONS -O3 DEPENDS - libc.src.__support.macros.properties.float + libc.src.__support.macros.properties.types libc.src.__support.FPUtil.nearest_integer_operations ) @@ -216,7 +216,7 @@ add_entrypoint_object( HDRS ../fabsf128.h DEPENDS - libc.src.__support.macros.properties.float + libc.src.__support.macros.properties.types libc.src.__support.FPUtil.basic_operations COMPILE_OPTIONS -O3 @@ -267,7 +267,7 @@ add_entrypoint_object( COMPILE_OPTIONS -O3 DEPENDS - libc.src.__support.macros.properties.float + libc.src.__support.macros.properties.types libc.src.__support.FPUtil.nearest_integer_operations ) @@ -316,7 +316,7 @@ add_entrypoint_object( COMPILE_OPTIONS -O3 DEPENDS - libc.src.__support.macros.properties.float + libc.src.__support.macros.properties.types libc.src.__support.FPUtil.nearest_integer_operations ) @@ -365,7 +365,7 @@ add_entrypoint_object( COMPILE_OPTIONS -O3 DEPENDS - libc.src.__support.macros.properties.float + libc.src.__support.macros.properties.types libc.src.__support.FPUtil.nearest_integer_operations ) @@ -908,7 +908,7 @@ add_entrypoint_object( HDRS ../copysignf128.h DEPENDS - libc.src.__support.macros.properties.float + libc.src.__support.macros.properties.types libc.src.__support.FPUtil.manipulation_functions COMPILE_OPTIONS -O3 @@ -959,7 +959,7 @@ add_entrypoint_object( COMPILE_OPTIONS -O3 DEPENDS - libc.src.__support.macros.properties.float + libc.src.__support.macros.properties.types libc.src.__support.FPUtil.manipulation_functions ) @@ -1008,7 +1008,7 @@ add_entrypoint_object( COMPILE_OPTIONS -O3 DEPENDS - libc.src.__support.macros.properties.float + libc.src.__support.macros.properties.types libc.src.__support.FPUtil.manipulation_functions ) @@ -1057,7 +1057,7 @@ add_entrypoint_object( COMPILE_OPTIONS -O3 DEPENDS - libc.src.__support.macros.properties.float + libc.src.__support.macros.properties.types libc.src.__support.FPUtil.manipulation_functions ) @@ -1106,7 +1106,7 @@ add_entrypoint_object( 
COMPILE_OPTIONS -O3 DEPENDS - libc.src.__support.macros.properties.float + libc.src.__support.macros.properties.types libc.src.__support.FPUtil.manipulation_functions ) @@ -1412,7 +1412,7 @@ add_entrypoint_object( HDRS ../fminf128.h DEPENDS - libc.src.__support.macros.properties.float + libc.src.__support.macros.properties.types libc.src.__support.FPUtil.basic_operations COMPILE_OPTIONS -O3 @@ -1461,7 +1461,7 @@ add_entrypoint_object( HDRS ../fmaxf128.h DEPENDS - libc.src.__support.macros.properties.float + libc.src.__support.macros.properties.types libc.src.__support.FPUtil.basic_operations COMPILE_OPTIONS -O3 @@ -1510,7 +1510,7 @@ add_entrypoint_object( HDRS ../sqrtf128.h DEPENDS - libc.src.__support.macros.properties.float + libc.src.__support.macros.properties.types libc.src.__support.FPUtil.sqrt COMPILE_OPTIONS -O3 @@ -1647,7 +1647,7 @@ add_entrypoint_object( COMPILE_OPTIONS -O3 DEPENDS - libc.src.__support.macros.properties.float + libc.src.__support.macros.properties.types libc.src.__support.FPUtil.basic_operations ) diff --git a/libc/src/math/ilogbf128.h b/libc/src/math/ilogbf128.h index df1145ffc0f8ab..d8fe3b970973c4 100644 --- a/libc/src/math/ilogbf128.h +++ b/libc/src/math/ilogbf128.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_ILOGBF128_H #define LLVM_LIBC_SRC_MATH_ILOGBF128_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/ldexpf128.h b/libc/src/math/ldexpf128.h index adf9d8f56b3566..7aa6ded3c8e4c6 100644 --- a/libc/src/math/ldexpf128.h +++ b/libc/src/math/ldexpf128.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_LDEXPF128_H #define LLVM_LIBC_SRC_MATH_LDEXPF128_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/llogb.h b/libc/src/math/llogb.h index 2d95877425e568..b51f89fc0416ee 100644 --- a/libc/src/math/llogb.h +++ b/libc/src/math/llogb.h 
@@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_LLOGB_H #define LLVM_LIBC_SRC_MATH_LLOGB_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/llogbf.h b/libc/src/math/llogbf.h index 512e174b66ee46..af4aa8a5b15c0b 100644 --- a/libc/src/math/llogbf.h +++ b/libc/src/math/llogbf.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_LLOGBF_H #define LLVM_LIBC_SRC_MATH_LLOGBF_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/llogbf128.h b/libc/src/math/llogbf128.h index 7fb74d4bbe7302..ce7c872a63db4e 100644 --- a/libc/src/math/llogbf128.h +++ b/libc/src/math/llogbf128.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_LLOGBF128_H #define LLVM_LIBC_SRC_MATH_LLOGBF128_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/llogbl.h b/libc/src/math/llogbl.h index 4033100fbe3dae..3c323a3af2a93f 100644 --- a/libc/src/math/llogbl.h +++ b/libc/src/math/llogbl.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_LLOGBL_H #define LLVM_LIBC_SRC_MATH_LLOGBL_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/logbf128.h b/libc/src/math/logbf128.h index 8baa076af1bfdb..7823bbd615b89a 100644 --- a/libc/src/math/logbf128.h +++ b/libc/src/math/logbf128.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_LOGBF128_H #define LLVM_LIBC_SRC_MATH_LOGBF128_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/roundf128.h b/libc/src/math/roundf128.h index c67c946cc5e8be..e4aca17d7eb637 100644 --- a/libc/src/math/roundf128.h +++ b/libc/src/math/roundf128.h @@ -9,7 +9,7 @@ #ifndef 
LLVM_LIBC_SRC_MATH_ROUNDF128_H #define LLVM_LIBC_SRC_MATH_ROUNDF128_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/sqrtf128.h b/libc/src/math/sqrtf128.h index bccb6bbb6332da..9da9eb69374cb8 100644 --- a/libc/src/math/sqrtf128.h +++ b/libc/src/math/sqrtf128.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_SQRTF128_H #define LLVM_LIBC_SRC_MATH_SQRTF128_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/truncf128.h b/libc/src/math/truncf128.h index c92c8202d4eef7..5eb6116551d1cb 100644 --- a/libc/src/math/truncf128.h +++ b/libc/src/math/truncf128.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_TRUNCF128_H #define LLVM_LIBC_SRC_MATH_TRUNCF128_H -#include "src/__support/macros/properties/float.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE { diff --git a/libcxx/cmake/config-ix.cmake b/libcxx/cmake/config-ix.cmake index 1e8c2f5ce46321..7406fba482e69d 100644 --- a/libcxx/cmake/config-ix.cmake +++ b/libcxx/cmake/config-ix.cmake @@ -1,5 +1,6 @@ include(CMakePushCheckState) include(CheckLibraryExists) +include(CheckSymbolExists) include(LLVMCheckCompilerLinkerFlag) include(CheckCCompilerFlag) include(CheckCXXCompilerFlag) @@ -97,6 +98,8 @@ int main(void) { return 0; } cmake_pop_check_state() endif() +check_symbol_exists(__PICOLIBC__ "string.h" PICOLIBC) + # Check libraries if(WIN32 AND NOT MINGW) # TODO(compnerd) do we want to support an emulation layer that allows for the @@ -116,6 +119,10 @@ elseif(ANDROID) set(LIBCXX_HAS_PTHREAD_LIB NO) set(LIBCXX_HAS_RT_LIB NO) set(LIBCXX_HAS_ATOMIC_LIB NO) +elseif(PICOLIBC) + set(LIBCXX_HAS_PTHREAD_LIB NO) + set(LIBCXX_HAS_RT_LIB NO) + set(LIBCXX_HAS_ATOMIC_LIB NO) else() check_library_exists(pthread pthread_create "" LIBCXX_HAS_PTHREAD_LIB) check_library_exists(rt clock_gettime "" 
LIBCXX_HAS_RT_LIB) diff --git a/libcxx/test/std/time/time.clock/time.clock.file/to_from_sys.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.file/to_from_sys.pass.cpp index b1031c81561047..5b1f4659911118 100644 --- a/libcxx/test/std/time/time.clock/time.clock.file/to_from_sys.pass.cpp +++ b/libcxx/test/std/time/time.clock/time.clock.file/to_from_sys.pass.cpp @@ -10,9 +10,6 @@ // UNSUPPORTED: availability-filesystem-missing -// "unable to find library from dependent library specifier: rt" -// XFAIL: LIBCXX-PICOLIBC-FIXME - // // // file_clock diff --git a/libcxx/test/std/time/time.clock/time.clock.hires/now.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.hires/now.pass.cpp index 8625ac58bde559..db1fb55df90721 100644 --- a/libcxx/test/std/time/time.clock/time.clock.hires/now.pass.cpp +++ b/libcxx/test/std/time/time.clock/time.clock.hires/now.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// "unable to find library from dependent library specifier: rt" -// XFAIL: LIBCXX-PICOLIBC-FIXME - // // high_resolution_clock diff --git a/libcxx/test/std/time/time.clock/time.clock.system/from_time_t.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.system/from_time_t.pass.cpp index 5ff667445b1a39..70dd8117e6cef5 100644 --- a/libcxx/test/std/time/time.clock/time.clock.system/from_time_t.pass.cpp +++ b/libcxx/test/std/time/time.clock/time.clock.system/from_time_t.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// "unable to find library from dependent library specifier: rt" -// XFAIL: LIBCXX-PICOLIBC-FIXME - // // system_clock diff --git a/libcxx/test/std/time/time.clock/time.clock.system/now.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.system/now.pass.cpp index 70fbe98d8dfd12..dade6bafa471bb 100644 --- a/libcxx/test/std/time/time.clock/time.clock.system/now.pass.cpp +++ b/libcxx/test/std/time/time.clock/time.clock.system/now.pass.cpp 
@@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// "unable to find library from dependent library specifier: rt" -// XFAIL: LIBCXX-PICOLIBC-FIXME - // // system_clock diff --git a/libcxx/test/std/time/time.clock/time.clock.system/to_time_t.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.system/to_time_t.pass.cpp index f3238f7bb1bb52..bf4339c32d1ca9 100644 --- a/libcxx/test/std/time/time.clock/time.clock.system/to_time_t.pass.cpp +++ b/libcxx/test/std/time/time.clock/time.clock.system/to_time_t.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// "unable to find library from dependent library specifier: rt" -// XFAIL: LIBCXX-PICOLIBC-FIXME - // // system_clock diff --git a/libcxx/test/std/time/time.point/time.point.nonmember/op_-duration.pass.cpp b/libcxx/test/std/time/time.point/time.point.nonmember/op_-duration.pass.cpp index 199bdec66878a2..80e9d04a769fde 100644 --- a/libcxx/test/std/time/time.point/time.point.nonmember/op_-duration.pass.cpp +++ b/libcxx/test/std/time/time.point/time.point.nonmember/op_-duration.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// "unable to find library from dependent library specifier: rt" -// XFAIL: LIBCXX-PICOLIBC-FIXME - // // time_point diff --git a/lldb/source/Symbol/Variable.cpp b/lldb/source/Symbol/Variable.cpp index 2bb2ff7db4b721..a33c3433d9e245 100644 --- a/lldb/source/Symbol/Variable.cpp +++ b/lldb/source/Symbol/Variable.cpp @@ -509,15 +509,17 @@ static void PrivateAutoCompleteMembers( CompilerType member_compiler_type = compiler_type.GetFieldAtIndex( i, member_name, nullptr, nullptr, nullptr); - if (partial_member_name.empty() || - llvm::StringRef(member_name).starts_with(partial_member_name)) { + if (partial_member_name.empty()) { + request.AddCompletion((prefix_path + member_name).str()); + } else if (llvm::StringRef(member_name) + 
.starts_with(partial_member_name)) { if (member_name == partial_member_name) { PrivateAutoComplete( frame, partial_path, prefix_path + member_name, // Anything that has been resolved // already will be in here member_compiler_type.GetCanonicalType(), request); - } else { + } else if (partial_path.empty()) { request.AddCompletion((prefix_path + member_name).str()); } } diff --git a/lldb/test/API/functionalities/completion/TestCompletion.py b/lldb/test/API/functionalities/completion/TestCompletion.py index f71bc73928f0f4..0d6907e0c3d229 100644 --- a/lldb/test/API/functionalities/completion/TestCompletion.py +++ b/lldb/test/API/functionalities/completion/TestCompletion.py @@ -60,10 +60,12 @@ def test_dwim_print(self): def do_test_variable_completion(self, command): self.complete_from_to(f"{command} fo", f"{command} fooo") - self.complete_from_to(f"{command} fooo.", f"{command} fooo.") + self.complete_from_to(f"{command} fooo.", f"{command} fooo.t") + self.complete_from_to(f"{command} fooo.t.", f"{command} fooo.t.x") self.complete_from_to(f"{command} fooo.dd", f"{command} fooo.dd") - self.complete_from_to(f"{command} ptr_fooo->", f"{command} ptr_fooo->") + self.complete_from_to(f"{command} ptr_fooo->", f"{command} ptr_fooo->t") + self.complete_from_to(f"{command} ptr_fooo->t.", f"{command} ptr_fooo->t.x") self.complete_from_to(f"{command} ptr_fooo->dd", f"{command} ptr_fooo->dd") self.complete_from_to(f"{command} cont", f"{command} container") diff --git a/lldb/test/API/functionalities/completion/main.cpp b/lldb/test/API/functionalities/completion/main.cpp index 06ff5773e8a9dc..f925c1d5acf31c 100644 --- a/lldb/test/API/functionalities/completion/main.cpp +++ b/lldb/test/API/functionalities/completion/main.cpp @@ -1,12 +1,17 @@ #include +class Baz { +public: + int x; +}; + class Foo { public: - int Bar(int x, int y) - { - return x + y; - } + Baz t; + int temp; + + int Bar(int x, int y) { return x + y; } }; namespace { int Quux (void) { return 0; } } diff --git 
a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst index 26ff34376fb838..33b0152bd7b49c 100644 --- a/llvm/docs/GlobalISel/GenericOpcode.rst +++ b/llvm/docs/GlobalISel/GenericOpcode.rst @@ -536,15 +536,15 @@ G_FMINIMUM ^^^^^^^^^^ NaN-propagating minimum that also treat -0.0 as less than 0.0. While -FMINNUM_IEEE follow IEEE 754-2008 semantics, FMINIMUM follows IEEE 754-2018 -draft semantics. +FMINNUM_IEEE follow IEEE 754-2008 semantics, FMINIMUM follows IEEE +754-2019 semantics. G_FMAXIMUM ^^^^^^^^^^ NaN-propagating maximum that also treat -0.0 as less than 0.0. While -FMAXNUM_IEEE follow IEEE 754-2008 semantics, FMAXIMUM follows IEEE 754-2018 -draft semantics. +FMAXNUM_IEEE follow IEEE 754-2008 semantics, FMAXIMUM follows IEEE +754-2019 semantics. G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FREM ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 4893cb9e65cda6..6cd3228cef66c7 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -15585,7 +15585,7 @@ Semantics: If either operand is a NaN, returns NaN. Otherwise returns the lesser of the two arguments. -0.0 is considered to be less than +0.0 for this intrinsic. Note that these are the semantics specified in the draft of -IEEE 754-2018. +IEEE 754-2019. .. _i_maximum: @@ -15625,7 +15625,7 @@ Semantics: If either operand is a NaN, returns NaN. Otherwise returns the greater of the two arguments. -0.0 is considered to be less than +0.0 for this intrinsic. Note that these are the semantics specified in the draft of -IEEE 754-2018. +IEEE 754-2019. .. _int_copysign: @@ -26004,7 +26004,7 @@ The third argument specifies the exception behavior as described above. Semantics: """""""""" -This function follows semantics specified in the draft of IEEE 754-2018. +This function follows semantics specified in the draft of IEEE 754-2019. 
'``llvm.experimental.constrained.minimum``' Intrinsic @@ -26036,7 +26036,7 @@ The third argument specifies the exception behavior as described above. Semantics: """""""""" -This function follows semantics specified in the draft of IEEE 754-2018. +This function follows semantics specified in the draft of IEEE 754-2019. '``llvm.experimental.constrained.ceil``' Intrinsic diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index ed443596897aea..8d293b02144307 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -117,6 +117,7 @@ on support follow. ``V`` Supported ``Za128rs`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Za64rs`` Supported (`See note <#riscv-profiles-extensions-note>`__) + ``Zacas`` Supported (`See note <#riscv-zacas-note>`__) ``Zawrs`` Assembly Support ``Zba`` Supported @@ -236,6 +237,11 @@ Supported ``Za128rs``, ``Za64rs``, ``Zic64b``, ``Ziccamoa``, ``Ziccif``, ``Zicclsm``, ``Ziccrse``, ``Shcounterenvw``, ``Shgatpa``, ``Shtvala``, ``Shvsatpa``, ``Shvstvala``, ``Shvstvecd``, ``Ssccptr``, ``Sscounterenw``, ``Ssstateen``, ``Ssstrict``, ``Sstvala``, ``Sstvecd``, ``Ssu64xl``, ``Svade``, ``Svbare`` These extensions are defined as part of the `RISC-V Profiles specification `__. They do not introduce any new features themselves, but instead describe existing hardware features. + .. _riscv-zacas-note: + +``Zacas`` + amocas.w will be used for i32 cmpxchg. amocas.d will be used for i64 cmpxchg on RV64. The compiler will not generate amocas.d on RV32 or amocas.q on RV64 due to ABI compatibility. These can only be used in the assembler. + Experimental Extensions ======================= @@ -252,9 +258,6 @@ The primary goal of experimental support is to assist in the process of ratifica ``experimental-zabha`` LLVM implements the `v1.0-rc1 draft specification `__. -``experimental-zacas`` - LLVM implements the `1.0-rc1 draft specification `__. 
- ``experimental-zalasr`` LLVM implements the `0.0.5 draft specification `__. diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 51b6527f65bb04..8ce6ee5cebb266 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -103,6 +103,7 @@ Changes to the RISC-V Backend * Codegen support was added for the Zimop (May-Be-Operations) extension. * The experimental Ssnpm, Smnpm, Smmpm, Sspm, and Supm 0.8.1 Pointer Masking extensions are supported. * The experimental Ssqosid extension is supported. +* Zacas is no longer experimental. Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index 8c247bbcec90a2..deb74cb2fdeb1e 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -1389,29 +1389,35 @@ inline APFloat neg(APFloat X) { return X; } -/// Implements IEEE minNum semantics. Returns the smaller of the 2 arguments if -/// both are not NaN. If either argument is a NaN, returns the other argument. +/// Implements IEEE-754 2019 minimumNumber semantics. Returns the smaller of the +/// 2 arguments if both are not NaN. If either argument is a NaN, returns the +/// other argument. -0 is treated as ordered less than +0. LLVM_READONLY inline APFloat minnum(const APFloat &A, const APFloat &B) { if (A.isNaN()) return B; if (B.isNaN()) return A; + if (A.isZero() && B.isZero() && (A.isNegative() != B.isNegative())) + return A.isNegative() ? A : B; return B < A ? B : A; } -/// Implements IEEE maxNum semantics. Returns the larger of the 2 arguments if -/// both are not NaN. If either argument is a NaN, returns the other argument. +/// Implements IEEE-754 2019 maximumNumber semantics. Returns the larger of the +/// 2 arguments if both are not NaN. If either argument is a NaN, returns the +/// other argument. +0 is treated as ordered greater than -0. 
LLVM_READONLY inline APFloat maxnum(const APFloat &A, const APFloat &B) { if (A.isNaN()) return B; if (B.isNaN()) return A; + if (A.isZero() && B.isZero() && (A.isNegative() != B.isNegative())) + return A.isNegative() ? B : A; return A < B ? B : A; } -/// Implements IEEE 754-2018 minimum semantics. Returns the smaller of 2 +/// Implements IEEE 754-2019 minimum semantics. Returns the smaller of 2 /// arguments, propagating NaNs and treating -0 as less than +0. LLVM_READONLY inline APFloat minimum(const APFloat &A, const APFloat &B) { @@ -1424,7 +1430,7 @@ inline APFloat minimum(const APFloat &A, const APFloat &B) { return B < A ? B : A; } -/// Implements IEEE 754-2018 maximum semantics. Returns the larger of 2 +/// Implements IEEE 754-2019 maximum semantics. Returns the larger of 2 /// arguments, propagating NaNs and treating -0 as less than +0. LLVM_READONLY inline APFloat maximum(const APFloat &A, const APFloat &B) { diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 8cb0bc9fd98133..ad876c5db4509a 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -978,7 +978,7 @@ enum NodeType { /// FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 /// as less than 0.0. While FMINNUM_IEEE/FMAXNUM_IEEE follow IEEE 754-2008 - /// semantics, FMINIMUM/FMAXIMUM follow IEEE 754-2018 draft semantics. + /// semantics, FMINIMUM/FMAXIMUM follow IEEE 754-2019 semantics. 
FMINIMUM, FMAXIMUM, diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake index 4c2559680e7d5a..c803f598512016 100644 --- a/llvm/include/llvm/Config/llvm-config.h.cmake +++ b/llvm/include/llvm/Config/llvm-config.h.cmake @@ -17,7 +17,7 @@ /* The number of commits in the linear history from the * start of the universe up to the latest llvm main commit * that has been merged */ -#define LLVM_MAIN_REVISION 491048 +#define LLVM_MAIN_REVISION 491120 /* Define if LLVM_ENABLE_DUMP is enabled */ #cmakedefine LLVM_ENABLE_DUMP diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 19197f50d9dff9..d2036e478d18f2 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -815,7 +815,7 @@ def G_FMAXNUM_IEEE : GenericInstruction { // FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 // as less than 0.0. While FMINNUM_IEEE/FMAXNUM_IEEE follow IEEE 754-2008 -// semantics, FMINIMUM/FMAXIMUM follow IEEE 754-2018 draft semantics. +// semantics, FMINIMUM/FMAXIMUM follow IEEE 754-2019 semantics. 
def G_FMINIMUM : GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$src1, type0:$src2); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 17757ca3e41111..18db7a819540af 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -952,6 +952,37 @@ def redundant_binop_in_equality : GICombineRule< [{ return Helper.matchRedundantBinOpInEquality(*${root}, ${info}); }]), (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>; +// Transform: (X == 0 & Y == 0) -> (X | Y) == 0 +def double_icmp_zero_and_combine: GICombineRule< + (defs root:$root), + (match (G_ICMP $d1, $p, $s1, 0), + (G_ICMP $d2, $p, $s2, 0), + (G_AND $root, $d1, $d2), + [{ return ${p}.getPredicate() == CmpInst::ICMP_EQ && + !MRI.getType(${s1}.getReg()).getScalarType().isPointer() && + (MRI.getType(${s1}.getReg()) == + MRI.getType(${s2}.getReg())); }]), + (apply (G_OR $ordst, $s1, $s2), + (G_ICMP $root, $p, $ordst, 0)) +>; + +// Transform: (X != 0 | Y != 0) -> (X | Y) != 0 +def double_icmp_zero_or_combine: GICombineRule< + (defs root:$root), + (match (G_ICMP $d1, $p, $s1, 0), + (G_ICMP $d2, $p, $s2, 0), + (G_OR $root, $d1, $d2), + [{ return ${p}.getPredicate() == CmpInst::ICMP_NE && + !MRI.getType(${s1}.getReg()).getScalarType().isPointer() && + (MRI.getType(${s1}.getReg()) == + MRI.getType(${s2}.getReg())); }]), + (apply (G_OR $ordst, $s1, $s2), + (G_ICMP $root, $p, $ordst, 0)) +>; + +def double_icmp_zero_and_or_combine : GICombineGroup<[double_icmp_zero_and_combine, + double_icmp_zero_or_combine]>; + def and_or_disjoint_mask : GICombineRule< (defs root:$root, build_fn_matchinfo:$info), (match (wip_match_opcode G_AND):$root, @@ -1343,7 +1374,7 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines, and_or_disjoint_mask, fma_combines, fold_binop_into_select, sub_add_reg, select_to_minmax, redundant_binop_in_equality, 
fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, - combine_concat_vector]>; + combine_concat_vector, double_icmp_zero_and_or_combine]>; // A combine group used to for prelegalizer combiners at -O0. The combines in // this group have been selected based on experiments to balance code size and diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index d028302b8c4d94..68f5c36e8fafc6 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -109,6 +109,7 @@ static const RISCVSupportedExtension SupportedExtensions[] = { {"za128rs", {1, 0}}, {"za64rs", {1, 0}}, + {"zacas", {1, 0}}, {"zawrs", {1, 0}}, {"zba", {1, 0}}, @@ -220,7 +221,6 @@ static const RISCVSupportedExtension SupportedExperimentalExtensions[] = { {"zaamo", {0, 2}}, {"zabha", {1, 0}}, - {"zacas", {1, 0}}, {"zalasr", {0, 1}}, {"zalrsc", {0, 2}}, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index b2fc7d874fe588..6edf01d1217f2d 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -856,14 +856,6 @@ bool isReadOnlySegment(const GlobalValue *GV); /// target triple \p TT, false otherwise. bool shouldEmitConstantsToTextSection(const Triple &TT); -/// \returns Integer value requested using \p F's \p Name attribute. -/// -/// \returns \p Default if attribute is not present. -/// -/// \returns \p Default and emits error if requested value cannot be converted -/// to integer. -int getIntegerAttribute(const Function &F, StringRef Name, int Default); - /// \returns A pair of integer values requested using \p F's \p Name attribute /// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired /// is false). 
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index e78ea63c3b4a56..f0b69b0b09809f 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -1468,21 +1468,15 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV8( if (passesFPReg) assert(STI->hasFPRegs() && "Subtarget needs fpregs"); - // Lazy store all fp registers to the stack + // Lazy store all fp registers to the stack. // This executes as NOP in the absence of floating-point support. - MachineInstrBuilder VLSTM = - BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM)) - .addReg(ARM::SP) - .add(predOps(ARMCC::AL)) - .addImm(0); // Represents a pseoudo register list, has no effect on - // the encoding. - // Mark non-live registers as undef - for (MachineOperand &MO : VLSTM->implicit_operands()) { - if (MO.isReg() && !MO.isDef()) { - Register Reg = MO.getReg(); - MO.setIsUndef(!LiveRegs.contains(Reg)); - } - } + MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM)) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1, + ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7}) + VLSTM.addReg(R, RegState::Implicit | + (LiveRegs.contains(R) ? 0 : RegState::Undef)); // Restore all arguments for (const auto &Regs : ClearedFPRegs) { @@ -1569,20 +1563,14 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV81(MachineBasicBlock &MBB, .addImm(CMSE_FP_SAVE_SIZE >> 2) .add(predOps(ARMCC::AL)); - // Lazy store all fp registers to the stack. - MachineInstrBuilder VLSTM = - BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM)) - .addReg(ARM::SP) - .add(predOps(ARMCC::AL)) - .addImm(0); // Represents a pseoudo register list, has no effect on - // the encoding. 
- // Mark non-live registers as undef - for (MachineOperand &MO : VLSTM->implicit_operands()) { - if (MO.isReg() && MO.isImplicit() && !MO.isDef()) { - Register Reg = MO.getReg(); - MO.setIsUndef(!LiveRegs.contains(Reg)); - } - } + // Lazy store all FP registers to the stack + MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM)) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1, + ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7}) + VLSTM.addReg(R, RegState::Implicit | + (LiveRegs.contains(R) ? 0 : RegState::Undef)); } else { // Push all the callee-saved registers (s16-s31). MachineInstrBuilder VPUSH = @@ -1685,12 +1673,9 @@ void ARMExpandPseudo::CMSERestoreFPRegsV8( // Lazy load fp regs from stack. // This executes as NOP in the absence of floating-point support. - MachineInstrBuilder VLLDM = - BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM)) - .addReg(ARM::SP) - .add(predOps(ARMCC::AL)) - .addImm(0); // Represents a pseoudo register list, has no effect on - // the encoding. + MachineInstrBuilder VLLDM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM)) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); if (STI->fixCMSE_CVE_2021_35465()) { auto Bundler = MIBundleBuilder(MBB, VLLDM); @@ -1772,9 +1757,7 @@ void ARMExpandPseudo::CMSERestoreFPRegsV81( // Load FP registers from stack. BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM)) .addReg(ARM::SP) - .add(predOps(ARMCC::AL)) - .addImm(0); // Represents a pseoudo register list, has no effect on the - // encoding. + .add(predOps(ARMCC::AL)); // Pop the stack space BuildMI(MBB, MBBI, DL, TII->get(ARM::tADDspi), ARM::SP) diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td index 404085820a6660..14e315534570d2 100644 --- a/llvm/lib/Target/ARM/ARMInstrFormats.td +++ b/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -1749,37 +1749,6 @@ class AXSI4 - : InstARM { - // Instruction operands. 
- bits<4> Rn; - bits<13> regs; // Does not affect encoding, for assembly/disassembly only. - list Predicates = [HasVFP2]; - let OutOperandList = (outs); - let InOperandList = (ins GPRnopc:$Rn, pred:$p, dpr_reglist:$regs); - let AsmString = asm; - let Pattern = []; - let DecoderNamespace = "VFP"; - // Encode instruction operands. - let Inst{19-16} = Rn; - let Inst{31-28} = 0b1110; - let Inst{27-25} = 0b110; - let Inst{24} = 0b0; - let Inst{23} = 0b0; - let Inst{22} = 0b0; - let Inst{21} = 0b1; - let Inst{20} = load; // Distinguishes vlldm from vlstm - let Inst{15-12} = 0b0000; - let Inst{11-9} = 0b101; - let Inst{8} = 0; // Single precision - let Inst{7} = et; // encoding type, 0 for T1 and 1 for T2. - let Inst{6-0} = 0b0000000; - let mayLoad = load; - let mayStore = !eq(load, 0); -} - // Double precision, unary class ADuI opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index 3094a4db2b4d12..55d3efbd9b9a2b 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -313,51 +313,29 @@ def : MnemonicAlias<"vstm", "vstmia">; //===----------------------------------------------------------------------===// // Lazy load / store multiple Instructions // -// VLLDM and VLSTM: -// 2 encoding options: -// T1 (bit 7 is 0): -// T1 takes an optional dpr_reglist, must be '{d0-d15}' (exactly) -// T1 require v8-M.Main, secure state, target with 16 D registers (or with no D registers - NOP) -// T2 (bit 7 is 1): -// T2 takes a mandatory dpr_reglist, must be '{d0-d31}' (exactly) -// T2 require v8.1-M.Main, secure state, target with 16/32 D registers (or with no D registers - NOP) -// (source: Arm v8-M ARM, DDI0553B.v ID16122022) - -def VLLDM : AXSI4FR<"vlldm${p}\t$Rn, $regs", 0, 1>, +def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone, + NoItinerary, "vlldm${p}\t$Rn", "", []>, 
Requires<[HasV8MMainline, Has8MSecExt]> { - let Defs = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15]; - let DecoderMethod = "DecodeLazyLoadStoreMul"; -} -// T1: assembly does not contains the register list. -def : InstAlias<"vlldm${p}\t$Rn", (VLLDM GPRnopc:$Rn, pred:$p, 0)>, - Requires<[HasV8MMainline, Has8MSecExt]>; -// T2: assembly must contains the register list. -// The register list has no effect on the encoding, it is for assembly/disassembly purposes only. -def VLLDM_T2 : AXSI4FR<"vlldm${p}\t$Rn, $regs", 1, 1>, - Requires<[HasV8_1MMainline, Has8MSecExt]> { - let Defs = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, - D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31]; - let DecoderMethod = "DecodeLazyLoadStoreMul"; -} -// T1: assembly contains the register list. -// The register list has no effect on the encoding, it is for assembly/disassembly purposes only. -def VLSTM : AXSI4FR<"vlstm${p}\t$Rn, $regs", 0, 0>, + let Inst{24-23} = 0b00; + let Inst{22} = 0; + let Inst{21} = 1; + let Inst{20} = 1; + let Inst{15-12} = 0; + let Inst{7-0} = 0; + let mayLoad = 1; + let Defs = [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, VPR, FPSCR, FPSCR_NZCV]; +} + +def VLSTM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone, + NoItinerary, "vlstm${p}\t$Rn", "", []>, Requires<[HasV8MMainline, Has8MSecExt]> { - let Defs = [VPR, FPSCR, FPSCR_NZCV]; - let Uses = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15]; - let DecoderMethod = "DecodeLazyLoadStoreMul"; -} -// T1: assembly does not contain the register list. -def : InstAlias<"vlstm${p}\t$Rn", (VLSTM GPRnopc:$Rn, pred:$p, 0)>, - Requires<[HasV8MMainline, Has8MSecExt]>; -// T2: assembly must contain the register list. -// The register list has no effect on the encoding, it is for assembly/disassembly purposes only. 
-def VLSTM_T2 : AXSI4FR<"vlstm${p}\t$Rn, $regs", 1, 0>, - Requires<[HasV8_1MMainline, Has8MSecExt]> { - let Defs = [VPR, FPSCR, FPSCR_NZCV]; - let Uses = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, - D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31]; - let DecoderMethod = "DecodeLazyLoadStoreMul"; + let Inst{24-23} = 0b00; + let Inst{22} = 0; + let Inst{21} = 1; + let Inst{20} = 0; + let Inst{15-12} = 0; + let Inst{7-0} = 0; + let mayStore = 1; } def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r), 0>, diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 5efbaf0d41060c..37bfb76a494dee 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -450,12 +450,11 @@ class ARMAsmParser : public MCTargetAsmParser { bool validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands, unsigned ListNo); - int tryParseRegister(bool AllowOutofBoundReg = false); + int tryParseRegister(); bool tryParseRegisterWithWriteBack(OperandVector &); int tryParseShiftRegister(OperandVector &); bool parseRegisterList(OperandVector &, bool EnforceOrder = true, - bool AllowRAAC = false, - bool AllowOutOfBoundReg = false); + bool AllowRAAC = false); bool parseMemory(OperandVector &); bool parseOperand(OperandVector &, StringRef Mnemonic); bool parseImmExpr(int64_t &Out); @@ -4073,7 +4072,7 @@ ParseStatus ARMAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, /// Try to parse a register name. The token must be an Identifier when called, /// and if it is a register name the token is eaten and the register number is /// returned. Otherwise return -1. 
-int ARMAsmParser::tryParseRegister(bool AllowOutOfBoundReg) { +int ARMAsmParser::tryParseRegister() { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) return -1; @@ -4117,8 +4116,7 @@ int ARMAsmParser::tryParseRegister(bool AllowOutOfBoundReg) { } // Some FPUs only have 16 D registers, so D16-D31 are invalid - if (!AllowOutOfBoundReg && !hasD32() && RegNum >= ARM::D16 && - RegNum <= ARM::D31) + if (!hasD32() && RegNum >= ARM::D16 && RegNum <= ARM::D31) return -1; Parser.Lex(); // Eat identifier token. @@ -4458,7 +4456,7 @@ insertNoDuplicates(SmallVectorImpl> &Regs, /// Parse a register list. bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, - bool AllowRAAC, bool AllowOutOfBoundReg) { + bool AllowRAAC) { MCAsmParser &Parser = getParser(); if (Parser.getTok().isNot(AsmToken::LCurly)) return TokError("Token is not a Left Curly Brace"); @@ -4512,7 +4510,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, return Error(RegLoc, "pseudo-register not allowed"); Parser.Lex(); // Eat the minus. 
SMLoc AfterMinusLoc = Parser.getTok().getLoc(); - int EndReg = tryParseRegister(AllowOutOfBoundReg); + int EndReg = tryParseRegister(); if (EndReg == -1) return Error(AfterMinusLoc, "register expected"); if (EndReg == ARM::RA_AUTH_CODE) @@ -4547,7 +4545,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, RegLoc = Parser.getTok().getLoc(); int OldReg = Reg; const AsmToken RegTok = Parser.getTok(); - Reg = tryParseRegister(AllowOutOfBoundReg); + Reg = tryParseRegister(); if (Reg == -1) return Error(RegLoc, "register expected"); if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE) @@ -6087,11 +6085,8 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { } case AsmToken::LBrac: return parseMemory(Operands); - case AsmToken::LCurly: { - bool AllowOutOfBoundReg = Mnemonic == "vlldm" || Mnemonic == "vlstm"; - return parseRegisterList(Operands, !Mnemonic.starts_with("clr"), false, - AllowOutOfBoundReg); - } + case AsmToken::LCurly: + return parseRegisterList(Operands, !Mnemonic.starts_with("clr")); case AsmToken::Dollar: case AsmToken::Hash: { // #42 -> immediate @@ -7601,33 +7596,6 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst, const unsigned Opcode = Inst.getOpcode(); switch (Opcode) { - case ARM::VLLDM: - case ARM::VLLDM_T2: - case ARM::VLSTM: - case ARM::VLSTM_T2: { - // Since in some cases both T1 and T2 are valid, tablegen can not always - // pick the correct instruction. - if (Operands.size() == 4) { // a register list has been provided - ARMOperand &Op = static_cast( - *Operands[3]); // the register list, a dpr_reglist - assert(Op.isDPRRegList()); - auto &RegList = Op.getRegList(); - // T2 requires v8.1-M.Main (cannot be handled by tablegen) - if (RegList.size() == 32 && !hasV8_1MMainline()) { - return Error(Op.getEndLoc(), "T2 version requires v8.1-M.Main"); - } - // When target has 32 D registers, T1 is undefined. 
- if (hasD32() && RegList.size() != 32) { - return Error(Op.getEndLoc(), "operand must be exactly {d0-d31}"); - } - // When target has 16 D registers, both T1 and T2 are valid. - if (!hasD32() && (RegList.size() != 16 && RegList.size() != 32)) { - return Error(Op.getEndLoc(), - "operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)"); - } - } - return false; - } case ARM::t2IT: { // Encoding is unpredictable if it ever results in a notional 'NV' // predicate. Since we don't parse 'NV' directly this means an 'AL' @@ -8763,32 +8731,6 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, } switch (Inst.getOpcode()) { - case ARM::VLLDM: - case ARM::VLSTM: { - // In some cases both T1 and T2 are valid, causing tablegen pick T1 instead - // of T2 - if (Operands.size() == 4) { // a register list has been provided - ARMOperand &Op = static_cast( - *Operands[3]); // the register list, a dpr_reglist - assert(Op.isDPRRegList()); - auto &RegList = Op.getRegList(); - // When the register list is {d0-d31} the instruction has to be the T2 - // variant - if (RegList.size() == 32) { - const unsigned Opcode = - (Inst.getOpcode() == ARM::VLLDM) ? ARM::VLLDM_T2 : ARM::VLSTM_T2; - MCInst TmpInst; - TmpInst.setOpcode(Opcode); - TmpInst.addOperand(Inst.getOperand(0)); - TmpInst.addOperand(Inst.getOperand(1)); - TmpInst.addOperand(Inst.getOperand(2)); - TmpInst.addOperand(Inst.getOperand(3)); - Inst = TmpInst; - return true; - } - } - return false; - } // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction. 
case ARM::LDRT_POST: case ARM::LDRBT_POST: { diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 705f3cbce12f02..604f22d7111900 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -700,9 +700,6 @@ DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address, static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder); -static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); #include "ARMGenDisassemblerTables.inc" @@ -7033,23 +7030,3 @@ static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn, return DS; } - -static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - DecodeStatus S = MCDisassembler::Success; - - const unsigned Rn = fieldFromInstruction(Insn, 16, 4); - // Adding Rn, holding memory location to save/load to/from, the only argument - // that is being encoded. - // '$Rn' in the assembly. - if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) - return MCDisassembler::Fail; - // An optional predicate, '$p' in the assembly. - DecodePredicateOperand(Inst, ARMCC::AL, Address, Decoder); - // An immediate that represents a floating point registers list. '$regs' in - // the assembly. - Inst.addOperand(MCOperand::createImm(0)); // Arbitrary value, has no effect. 
- - return S; -} diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp index 24e627cd9a4e1f..fbd067d79af0b3 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp @@ -91,38 +91,6 @@ void ARMInstPrinter::printInst(const MCInst *MI, uint64_t Address, unsigned Opcode = MI->getOpcode(); switch (Opcode) { - case ARM::VLLDM: { - const MCOperand &Reg = MI->getOperand(0); - O << '\t' << "vlldm" << '\t'; - printRegName(O, Reg.getReg()); - O << ", " - << "{d0 - d15}"; - return; - } - case ARM::VLLDM_T2: { - const MCOperand &Reg = MI->getOperand(0); - O << '\t' << "vlldm" << '\t'; - printRegName(O, Reg.getReg()); - O << ", " - << "{d0 - d31}"; - return; - } - case ARM::VLSTM: { - const MCOperand &Reg = MI->getOperand(0); - O << '\t' << "vlstm" << '\t'; - printRegName(O, Reg.getReg()); - O << ", " - << "{d0 - d15}"; - return; - } - case ARM::VLSTM_T2: { - const MCOperand &Reg = MI->getOperand(0); - O << '\t' << "vlstm" << '\t'; - printRegName(O, Reg.getReg()); - O << ", " - << "{d0 - d31}"; - return; - } // Check for MOVs and print canonical forms, instead. case ARM::MOVsr: { // FIXME: Thumb variants? 
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index bcaf4477749494..9773b2998c7dc4 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -185,7 +185,7 @@ def HasStdExtZabha : Predicate<"Subtarget->hasStdExtZabha()">, "'Zabha' (Byte and Halfword Atomic Memory Operations)">; def FeatureStdExtZacas - : SubtargetFeature<"experimental-zacas", "HasStdExtZacas", "true", + : SubtargetFeature<"zacas", "HasStdExtZacas", "true", "'Zacas' (Atomic Compare-And-Swap Instructions)">; def HasStdExtZacas : Predicate<"Subtarget->hasStdExtZacas()">, AssemblerPredicate<(all_of FeatureStdExtZacas), diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index dde1882f5eea83..e647f56416bfa6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1999,6 +1999,10 @@ bool RISCVTargetLowering::canSplatOperand(Instruction *I, int Operand) const { case Intrinsic::vp_sdiv: case Intrinsic::vp_urem: case Intrinsic::vp_srem: + case Intrinsic::ssub_sat: + case Intrinsic::vp_ssub_sat: + case Intrinsic::usub_sat: + case Intrinsic::vp_usub_sat: return Operand == 1; // These intrinsics are commutative. case Intrinsic::vp_add: @@ -2010,6 +2014,18 @@ bool RISCVTargetLowering::canSplatOperand(Instruction *I, int Operand) const { case Intrinsic::vp_fmul: case Intrinsic::vp_icmp: case Intrinsic::vp_fcmp: + case Intrinsic::smin: + case Intrinsic::vp_smin: + case Intrinsic::umin: + case Intrinsic::vp_umin: + case Intrinsic::smax: + case Intrinsic::vp_smax: + case Intrinsic::umax: + case Intrinsic::vp_umax: + case Intrinsic::sadd_sat: + case Intrinsic::vp_sadd_sat: + case Intrinsic::uadd_sat: + case Intrinsic::vp_uadd_sat: // These intrinsics have 'vr' versions. 
case Intrinsic::vp_sub: case Intrinsic::vp_fsub: diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index f04968d82e86e2..2e4e69fb4f920f 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -488,9 +488,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // vmv.v.x v8, a0 // vmsne.vi v0, v8, 0 return LT.first * - (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for andi - getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI}, - LT.second, CostKind)); + (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI}, + LT.second, CostKind)); } // Example sequence: // vsetivli zero, 2, e8, mf8, ta, mu (ignored) @@ -502,11 +501,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // vmsne.vi v0, v8, 0 return LT.first * - (TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi - getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM, - RISCV::VMV_X_S, RISCV::VMV_V_X, - RISCV::VMSNE_VI}, - LT.second, CostKind)); + (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM, + RISCV::VMV_X_S, RISCV::VMV_V_X, + RISCV::VMSNE_VI}, + LT.second, CostKind)); } if (HasScalar) { @@ -551,9 +549,12 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, if (LT.second.isFixedLengthVector()) // vrsub.vi has a 5 bit immediate field, otherwise an li suffices LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 
0 : 1; - // FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX} + unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV}; + if (LT.second.isFixedLengthVector() && + isInt<5>(LT.second.getVectorNumElements() - 1)) + Opcodes[1] = RISCV::VRSUB_VI; InstructionCost GatherCost = - 2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind); + getRISCVInstructionCost(Opcodes, LT.second, CostKind); // Mask operation additionally required extend and truncate InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0; return LT.first * (LenCost + GatherCost + ExtendCost); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index bec13d1c00ef7d..93088c7cde938b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5878,13 +5878,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, } } - // Peek through trunc/aext/zext. + // Peek through trunc/aext/zext/bitcast. // TODO: aext shouldn't require SM_SentinelZero padding. // TODO: handle shift of scalars. 
unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits(); while (Scl.getOpcode() == ISD::TRUNCATE || Scl.getOpcode() == ISD::ANY_EXTEND || - Scl.getOpcode() == ISD::ZERO_EXTEND) { + Scl.getOpcode() == ISD::ZERO_EXTEND || + (Scl.getOpcode() == ISD::BITCAST && + Scl.getScalarValueSizeInBits() == + Scl.getOperand(0).getScalarValueSizeInBits())) { Scl = Scl.getOperand(0); MinBitsPerElt = std::min(MinBitsPerElt, Scl.getScalarValueSizeInBits()); diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll index 30da63b3feec65..7cc7cff0e6e857 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll @@ -14,7 +14,7 @@ define void @vector_broadcast() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %8 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %8 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %9 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %10 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %11 = shufflevector undef, undef, zeroinitializer @@ -29,7 +29,7 @@ define void @vector_broadcast() { ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = shufflevector undef, undef, zeroinitializer ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = 
shufflevector undef, undef, zeroinitializer ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = shufflevector undef, undef, zeroinitializer -; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %8 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %8 = shufflevector undef, undef, zeroinitializer ; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %9 = shufflevector undef, undef, zeroinitializer ; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %10 = shufflevector undef, undef, zeroinitializer ; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %11 = shufflevector undef, undef, zeroinitializer @@ -78,20 +78,20 @@ declare @llvm.vector.insert.nxv16i32.nxv4i32( @llvm.experimental.vector.reverse.nxv16i8( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv32i8 = call @llvm.experimental.vector.reverse.nxv32i8( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv16i8 = call @llvm.experimental.vector.reverse.nxv16i8( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv32i8 = call @llvm.experimental.vector.reverse.nxv32i8( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call @llvm.experimental.vector.reverse.nxv2i16( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call @llvm.experimental.vector.reverse.nxv4i16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i16 = call @llvm.experimental.vector.reverse.nxv8i16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv16i16 = call @llvm.experimental.vector.reverse.nxv16i16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: 
%reverse_nxv4i32 = call @llvm.experimental.vector.reverse.nxv4i32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv8i32 = call @llvm.experimental.vector.reverse.nxv8i32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i64 = call @llvm.experimental.vector.reverse.nxv2i64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv4i64 = call @llvm.experimental.vector.reverse.nxv4i64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %reverse_nxv8i64 = call @llvm.experimental.vector.reverse.nxv8i64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %reverse_nxv16i64 = call @llvm.experimental.vector.reverse.nxv16i64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %reverse_nxv32i64 = call @llvm.experimental.vector.reverse.nxv32i64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv16i1 = call @llvm.experimental.vector.reverse.nxv16i1( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv8i16 = call @llvm.experimental.vector.reverse.nxv8i16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv16i16 = call @llvm.experimental.vector.reverse.nxv16i16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv4i32 = call @llvm.experimental.vector.reverse.nxv4i32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv8i32 = call @llvm.experimental.vector.reverse.nxv8i32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv2i64 = call @llvm.experimental.vector.reverse.nxv2i64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv4i64 = call @llvm.experimental.vector.reverse.nxv4i64( 
undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv8i64 = call @llvm.experimental.vector.reverse.nxv8i64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 166 for instruction: %reverse_nxv16i64 = call @llvm.experimental.vector.reverse.nxv16i64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 332 for instruction: %reverse_nxv32i64 = call @llvm.experimental.vector.reverse.nxv32i64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %reverse_nxv16i1 = call @llvm.experimental.vector.reverse.nxv16i1( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call @llvm.experimental.vector.reverse.nxv8i1( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call @llvm.experimental.vector.reverse.nxv4i1( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call @llvm.experimental.vector.reverse.nxv2i1( undef) diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll index fc4a6b17d3f826..46bf3152ac5bd3 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll @@ -45,9 +45,9 @@ define void @broadcast_scalable() #0{ ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %38 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %39 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %40 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %41 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %42 = shufflevector undef, undef, 
zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %43 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %41 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %42 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %43 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SIZE-LABEL: 'broadcast_scalable' @@ -92,9 +92,9 @@ define void @broadcast_scalable() #0{ ; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %38 = shufflevector undef, undef, zeroinitializer ; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %39 = shufflevector undef, undef, zeroinitializer ; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %40 = shufflevector undef, undef, zeroinitializer -; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %41 = shufflevector undef, undef, zeroinitializer -; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %42 = shufflevector undef, undef, zeroinitializer -; SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %43 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %41 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %42 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %43 = shufflevector undef, undef, zeroinitializer ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %zero = shufflevector undef, undef, zeroinitializer diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll 
b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll index 146909cc93df1c..e80dbe31683f3d 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll @@ -20,21 +20,21 @@ define void @reverse() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 
3 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SIZE-LABEL: 'reverse' diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-2-icmps-of-0-and-or.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-2-icmps-of-0-and-or.mir new file mode 100644 index 00000000000000..2ce5c693f3dbca --- 
/dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-2-icmps-of-0-and-or.mir @@ -0,0 +1,1244 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +# REQUIRES: asserts + + +--- +name: valid_and_eq_0_eq_0_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $w0, $w1 + + ; CHECK-LABEL: name: valid_and_eq_0_eq_0_s32 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: %y:_(s32) = COPY $w1 + ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR %x, %y + ; CHECK-NEXT: %and:_(s1) = G_ICMP intpred(eq), [[OR]](s32), %zero + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %and(s1) + ; CHECK-NEXT: $w0 = COPY %zext(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %y:_(s32) = COPY $w1 + %zero:_(s32) = G_CONSTANT i32 0 + %cmp1:_(s1) = G_ICMP intpred(eq), %x:_(s32), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(eq), %y:_(s32), %zero:_ + %and:_(s1) = G_AND %cmp1, %cmp2 + %zext:_(s32) = G_ZEXT %and:_(s1) + $w0 = COPY %zext + RET_ReallyLR implicit $w0 + +... 
+--- +name: invalid_and_eq_1_eq_0_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $w0, $w1 + + ; CHECK-LABEL: name: invalid_and_eq_1_eq_0_s32 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: %y:_(s32) = COPY $w1 + ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %one:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(eq), %x(s32), %one + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(eq), %y(s32), %zero + ; CHECK-NEXT: %and:_(s1) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %and(s1) + ; CHECK-NEXT: $w0 = COPY %zext(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %y:_(s32) = COPY $w1 + %zero:_(s32) = G_CONSTANT i32 0 + %one:_(s32) = G_CONSTANT i32 1 + %cmp1:_(s1) = G_ICMP intpred(eq), %x:_(s32), %one:_ + %cmp2:_(s1) = G_ICMP intpred(eq), %y:_(s32), %zero:_ + %and:_(s1) = G_AND %cmp1, %cmp2 + %zext:_(s32) = G_ZEXT %and:_(s1) + $w0 = COPY %zext + RET_ReallyLR implicit $w0 + +... 
+--- +name: invalid_and_eq_0_eq_1_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $w0, $w1 + + ; CHECK-LABEL: name: invalid_and_eq_0_eq_1_s32 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: %y:_(s32) = COPY $w1 + ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %one:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(eq), %x(s32), %zero + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(eq), %y(s32), %one + ; CHECK-NEXT: %and:_(s1) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %and(s1) + ; CHECK-NEXT: $w0 = COPY %zext(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %y:_(s32) = COPY $w1 + %zero:_(s32) = G_CONSTANT i32 0 + %one:_(s32) = G_CONSTANT i32 1 + %cmp1:_(s1) = G_ICMP intpred(eq), %x:_(s32), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(eq), %y:_(s32), %one:_ + %and:_(s1) = G_AND %cmp1, %cmp2 + %zext:_(s32) = G_ZEXT %and:_(s1) + $w0 = COPY %zext + RET_ReallyLR implicit $w0 + +... +--- +name: invalid_and_ne_0_eq_0_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $w0, $w1 + + ; CHECK-LABEL: name: invalid_and_ne_0_eq_0_s32 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: %y:_(s32) = COPY $w1 + ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(ne), %x(s32), %zero + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(eq), %y(s32), %zero + ; CHECK-NEXT: %and:_(s1) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %and(s1) + ; CHECK-NEXT: $w0 = COPY %zext(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %y:_(s32) = COPY $w1 + %zero:_(s32) = G_CONSTANT i32 0 + %cmp1:_(s1) = G_ICMP intpred(ne), %x:_(s32), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(eq), %y:_(s32), %zero:_ + %and:_(s1) = G_AND %cmp1, %cmp2 + %zext:_(s32) = G_ZEXT %and:_(s1) + $w0 = COPY %zext + RET_ReallyLR implicit $w0 + +... 
+--- +name: invalid_and_eq_0_ne_0_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $w0, $w1 + + ; CHECK-LABEL: name: invalid_and_eq_0_ne_0_s32 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: %y:_(s32) = COPY $w1 + ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(eq), %x(s32), %zero + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(ne), %y(s32), %zero + ; CHECK-NEXT: %and:_(s1) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %and(s1) + ; CHECK-NEXT: $w0 = COPY %zext(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %y:_(s32) = COPY $w1 + %zero:_(s32) = G_CONSTANT i32 0 + %cmp1:_(s1) = G_ICMP intpred(eq), %x:_(s32), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(ne), %y:_(s32), %zero:_ + %and:_(s1) = G_AND %cmp1, %cmp2 + %zext:_(s32) = G_ZEXT %and:_(s1) + $w0 = COPY %zext + RET_ReallyLR implicit $w0 + +... +--- +name: invalid_and_ne_0_ne_0_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $w0, $w1 + + ; CHECK-LABEL: name: invalid_and_ne_0_ne_0_s32 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: %y:_(s32) = COPY $w1 + ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(ne), %x(s32), %zero + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(ne), %y(s32), %zero + ; CHECK-NEXT: %and:_(s1) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %and(s1) + ; CHECK-NEXT: $w0 = COPY %zext(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %y:_(s32) = COPY $w1 + %zero:_(s32) = G_CONSTANT i32 0 + %cmp1:_(s1) = G_ICMP intpred(ne), %x:_(s32), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(ne), %y:_(s32), %zero:_ + %and:_(s1) = G_AND %cmp1, %cmp2 + %zext:_(s32) = G_ZEXT %and:_(s1) + $w0 = COPY %zext + RET_ReallyLR implicit $w0 + +... 
+--- +name: valid_or_ne_0_ne_0_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $w0, $w1 + + ; CHECK-LABEL: name: valid_or_ne_0_ne_0_s32 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: %y:_(s32) = COPY $w1 + ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR %x, %y + ; CHECK-NEXT: %or:_(s1) = G_ICMP intpred(ne), [[OR]](s32), %zero + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %or(s1) + ; CHECK-NEXT: $w0 = COPY %zext(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %y:_(s32) = COPY $w1 + %zero:_(s32) = G_CONSTANT i32 0 + %cmp1:_(s1) = G_ICMP intpred(ne), %x:_(s32), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(ne), %y:_(s32), %zero:_ + %or:_(s1) = G_OR %cmp1, %cmp2 + %zext:_(s32) = G_ZEXT %or:_(s1) + $w0 = COPY %zext + RET_ReallyLR implicit $w0 + +... +--- +name: invalid_or_ne_1_ne_0_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $w0, $w1 + + ; CHECK-LABEL: name: invalid_or_ne_1_ne_0_s32 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: %y:_(s32) = COPY $w1 + ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %one:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(ne), %x(s32), %one + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(ne), %y(s32), %zero + ; CHECK-NEXT: %or:_(s1) = G_OR %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %or(s1) + ; CHECK-NEXT: $w0 = COPY %zext(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %y:_(s32) = COPY $w1 + %zero:_(s32) = G_CONSTANT i32 0 + %one:_(s32) = G_CONSTANT i32 1 + %cmp1:_(s1) = G_ICMP intpred(ne), %x:_(s32), %one:_ + %cmp2:_(s1) = G_ICMP intpred(ne), %y:_(s32), %zero:_ + %or:_(s1) = G_OR %cmp1, %cmp2 + %zext:_(s32) = G_ZEXT %or:_(s1) + $w0 = COPY %zext + RET_ReallyLR implicit $w0 + +... 
+--- +name: invalid_or_ne_0_ne_1_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $w0, $w1 + + ; CHECK-LABEL: name: invalid_or_ne_0_ne_1_s32 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: %y:_(s32) = COPY $w1 + ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %one:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(ne), %x(s32), %zero + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(ne), %y(s32), %one + ; CHECK-NEXT: %or:_(s1) = G_OR %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %or(s1) + ; CHECK-NEXT: $w0 = COPY %zext(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %y:_(s32) = COPY $w1 + %zero:_(s32) = G_CONSTANT i32 0 + %one:_(s32) = G_CONSTANT i32 1 + %cmp1:_(s1) = G_ICMP intpred(ne), %x:_(s32), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(ne), %y:_(s32), %one:_ + %or:_(s1) = G_OR %cmp1, %cmp2 + %zext:_(s32) = G_ZEXT %or:_(s1) + $w0 = COPY %zext + RET_ReallyLR implicit $w0 + +... +--- +name: invalid_or_eq_0_ne_0_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $w0, $w1 + + ; CHECK-LABEL: name: invalid_or_eq_0_ne_0_s32 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: %y:_(s32) = COPY $w1 + ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(eq), %x(s32), %zero + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(ne), %y(s32), %zero + ; CHECK-NEXT: %or:_(s1) = G_OR %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %or(s1) + ; CHECK-NEXT: $w0 = COPY %zext(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %y:_(s32) = COPY $w1 + %zero:_(s32) = G_CONSTANT i32 0 + %cmp1:_(s1) = G_ICMP intpred(eq), %x:_(s32), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(ne), %y:_(s32), %zero:_ + %or:_(s1) = G_OR %cmp1, %cmp2 + %zext:_(s32) = G_ZEXT %or:_(s1) + $w0 = COPY %zext + RET_ReallyLR implicit $w0 + +... 
+--- +name: invalid_or_ne_0_eq_0_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $w0, $w1 + + ; CHECK-LABEL: name: invalid_or_ne_0_eq_0_s32 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: %y:_(s32) = COPY $w1 + ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(ne), %x(s32), %zero + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(eq), %y(s32), %zero + ; CHECK-NEXT: %or:_(s1) = G_OR %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %or(s1) + ; CHECK-NEXT: $w0 = COPY %zext(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %y:_(s32) = COPY $w1 + %zero:_(s32) = G_CONSTANT i32 0 + %cmp1:_(s1) = G_ICMP intpred(ne), %x:_(s32), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(eq), %y:_(s32), %zero:_ + %or:_(s1) = G_OR %cmp1, %cmp2 + %zext:_(s32) = G_ZEXT %or:_(s1) + $w0 = COPY %zext + RET_ReallyLR implicit $w0 + +... + +--- +name: valid_and_eq_0_eq_0_s64 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: valid_and_eq_0_eq_0_s64 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: %y:_(s64) = COPY $x1 + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR %x, %y + ; CHECK-NEXT: %and:_(s1) = G_ICMP intpred(eq), [[OR]](s64), %zero + ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %and(s1) + ; CHECK-NEXT: $x0 = COPY %zext(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s64) = COPY $x0 + %y:_(s64) = COPY $x1 + %zero:_(s64) = G_CONSTANT i64 0 + %cmp1:_(s1) = G_ICMP intpred(eq), %x:_(s64), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(eq), %y:_(s64), %zero:_ + %and:_(s1) = G_AND %cmp1, %cmp2 + %zext:_(s64) = G_ZEXT %and:_(s1) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_and_eq_1_eq_0_s64 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_and_eq_1_eq_0_s64 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: %y:_(s64) = COPY $x1 + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %one:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(eq), %x(s64), %one + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(eq), %y(s64), %zero + ; CHECK-NEXT: %and:_(s1) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %and(s1) + ; CHECK-NEXT: $x0 = COPY %zext(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s64) = COPY $x0 + %y:_(s64) = COPY $x1 + %zero:_(s64) = G_CONSTANT i64 0 + %one:_(s64) = G_CONSTANT i64 1 + %cmp1:_(s1) = G_ICMP intpred(eq), %x:_(s64), %one:_ + %cmp2:_(s1) = G_ICMP intpred(eq), %y:_(s64), %zero:_ + %and:_(s1) = G_AND %cmp1, %cmp2 + %zext:_(s64) = G_ZEXT %and:_(s1) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_and_eq_0_eq_1_s64 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_and_eq_0_eq_1_s64 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: %y:_(s64) = COPY $x1 + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %one:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(eq), %x(s64), %zero + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(eq), %y(s64), %one + ; CHECK-NEXT: %and:_(s1) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %and(s1) + ; CHECK-NEXT: $x0 = COPY %zext(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s64) = COPY $x0 + %y:_(s64) = COPY $x1 + %zero:_(s64) = G_CONSTANT i64 0 + %one:_(s64) = G_CONSTANT i64 1 + %cmp1:_(s1) = G_ICMP intpred(eq), %x:_(s64), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(eq), %y:_(s64), %one:_ + %and:_(s1) = G_AND %cmp1, %cmp2 + %zext:_(s64) = G_ZEXT %and:_(s1) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... +--- +name: invalid_and_ne_0_eq_0_s64 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_and_ne_0_eq_0_s64 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: %y:_(s64) = COPY $x1 + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(ne), %x(s64), %zero + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(eq), %y(s64), %zero + ; CHECK-NEXT: %and:_(s1) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %and(s1) + ; CHECK-NEXT: $x0 = COPY %zext(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s64) = COPY $x0 + %y:_(s64) = COPY $x1 + %zero:_(s64) = G_CONSTANT i64 0 + %cmp1:_(s1) = G_ICMP intpred(ne), %x:_(s64), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(eq), %y:_(s64), %zero:_ + %and:_(s1) = G_AND %cmp1, %cmp2 + %zext:_(s64) = G_ZEXT %and:_(s1) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_and_eq_0_ne_0_s64 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_and_eq_0_ne_0_s64 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: %y:_(s64) = COPY $x1 + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(eq), %x(s64), %zero + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(ne), %y(s64), %zero + ; CHECK-NEXT: %and:_(s1) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %and(s1) + ; CHECK-NEXT: $x0 = COPY %zext(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s64) = COPY $x0 + %y:_(s64) = COPY $x1 + %zero:_(s64) = G_CONSTANT i64 0 + %cmp1:_(s1) = G_ICMP intpred(eq), %x:_(s64), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(ne), %y:_(s64), %zero:_ + %and:_(s1) = G_AND %cmp1, %cmp2 + %zext:_(s64) = G_ZEXT %and:_(s1) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... +--- +name: invalid_and_ne_0_ne_0_s64 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_and_ne_0_ne_0_s64 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: %y:_(s64) = COPY $x1 + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(ne), %x(s64), %zero + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(ne), %y(s64), %zero + ; CHECK-NEXT: %and:_(s1) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %and(s1) + ; CHECK-NEXT: $x0 = COPY %zext(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s64) = COPY $x0 + %y:_(s64) = COPY $x1 + %zero:_(s64) = G_CONSTANT i64 0 + %cmp1:_(s1) = G_ICMP intpred(ne), %x:_(s64), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(ne), %y:_(s64), %zero:_ + %and:_(s1) = G_AND %cmp1, %cmp2 + %zext:_(s64) = G_ZEXT %and:_(s1) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: valid_or_ne_0_ne_0_s64 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: valid_or_ne_0_ne_0_s64 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: %y:_(s64) = COPY $x1 + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR %x, %y + ; CHECK-NEXT: %or:_(s1) = G_ICMP intpred(ne), [[OR]](s64), %zero + ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %or(s1) + ; CHECK-NEXT: $x0 = COPY %zext(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s64) = COPY $x0 + %y:_(s64) = COPY $x1 + %zero:_(s64) = G_CONSTANT i64 0 + %cmp1:_(s1) = G_ICMP intpred(ne), %x:_(s64), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(ne), %y:_(s64), %zero:_ + %or:_(s1) = G_OR %cmp1, %cmp2 + %zext:_(s64) = G_ZEXT %or:_(s1) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... +--- +name: invalid_or_ne_1_ne_0_s64 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_or_ne_1_ne_0_s64 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: %y:_(s64) = COPY $x1 + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %one:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(ne), %x(s64), %one + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(ne), %y(s64), %zero + ; CHECK-NEXT: %or:_(s1) = G_OR %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %or(s1) + ; CHECK-NEXT: $x0 = COPY %zext(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s64) = COPY $x0 + %y:_(s64) = COPY $x1 + %zero:_(s64) = G_CONSTANT i64 0 + %one:_(s64) = G_CONSTANT i64 1 + %cmp1:_(s1) = G_ICMP intpred(ne), %x:_(s64), %one:_ + %cmp2:_(s1) = G_ICMP intpred(ne), %y:_(s64), %zero:_ + %or:_(s1) = G_OR %cmp1, %cmp2 + %zext:_(s64) = G_ZEXT %or:_(s1) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_or_ne_0_ne_1_s64 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_or_ne_0_ne_1_s64 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: %y:_(s64) = COPY $x1 + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %one:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(ne), %x(s64), %zero + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(ne), %y(s64), %one + ; CHECK-NEXT: %or:_(s1) = G_OR %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %or(s1) + ; CHECK-NEXT: $x0 = COPY %zext(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s64) = COPY $x0 + %y:_(s64) = COPY $x1 + %zero:_(s64) = G_CONSTANT i64 0 + %one:_(s64) = G_CONSTANT i64 1 + %cmp1:_(s1) = G_ICMP intpred(ne), %x:_(s64), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(ne), %y:_(s64), %one:_ + %or:_(s1) = G_OR %cmp1, %cmp2 + %zext:_(s64) = G_ZEXT %or:_(s1) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... +--- +name: invalid_or_eq_0_ne_0_s64 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_or_eq_0_ne_0_s64 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: %y:_(s64) = COPY $x1 + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(eq), %x(s64), %zero + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(ne), %y(s64), %zero + ; CHECK-NEXT: %or:_(s1) = G_OR %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %or(s1) + ; CHECK-NEXT: $x0 = COPY %zext(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s64) = COPY $x0 + %y:_(s64) = COPY $x1 + %zero:_(s64) = G_CONSTANT i64 0 + %cmp1:_(s1) = G_ICMP intpred(eq), %x:_(s64), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(ne), %y:_(s64), %zero:_ + %or:_(s1) = G_OR %cmp1, %cmp2 + %zext:_(s64) = G_ZEXT %or:_(s1) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_or_ne_0_eq_0_s64 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_or_ne_0_eq_0_s64 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: %y:_(s64) = COPY $x1 + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(ne), %x(s64), %zero + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(eq), %y(s64), %zero + ; CHECK-NEXT: %or:_(s1) = G_OR %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %or(s1) + ; CHECK-NEXT: $x0 = COPY %zext(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s64) = COPY $x0 + %y:_(s64) = COPY $x1 + %zero:_(s64) = G_CONSTANT i64 0 + %cmp1:_(s1) = G_ICMP intpred(ne), %x:_(s64), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(eq), %y:_(s64), %zero:_ + %or:_(s1) = G_OR %cmp1, %cmp2 + %zext:_(s64) = G_ZEXT %or:_(s1) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... +--- +name: valid_and_eq_0_eq_0_vec +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: valid_and_eq_0_eq_0_vec + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x s32>) = COPY $x0 + ; CHECK-NEXT: %y:_(<2 x s32>) = COPY $x1 + ; CHECK-NEXT: %zero_scalar:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<2 x s32>) = G_OR %x, %y + ; CHECK-NEXT: %and:_(<2 x s1>) = G_ICMP intpred(eq), [[OR]](<2 x s32>), %zero + ; CHECK-NEXT: %zext:_(<2 x s32>) = G_ZEXT %and(<2 x s1>) + ; CHECK-NEXT: $x0 = COPY %zext(<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(<2 x s32>) = COPY $x0 + %y:_(<2 x s32>) = COPY $x1 + %zero_scalar:_(s32) = G_CONSTANT i32 0 + %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + %cmp1:_(<2 x s1>) = G_ICMP intpred(eq), %x:_(<2 x s32>), %zero:_ + %cmp2:_(<2 x s1>) = G_ICMP intpred(eq), %y:_(<2 x s32>), %zero:_ + 
%and:_(<2 x s1>) = G_AND %cmp1, %cmp2 + %zext:_(<2 x s32>) = G_ZEXT %and:_(<2 x s1>) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... +--- +name: invalid_and_eq_non_0_eq_0_vec +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_and_eq_non_0_eq_0_vec + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x s32>) = COPY $x0 + ; CHECK-NEXT: %y:_(<2 x s32>) = COPY $x1 + ; CHECK-NEXT: %scalar0:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %scalar1:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar0(s32) + ; CHECK-NEXT: %non_zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar1(s32) + ; CHECK-NEXT: %cmp1:_(<2 x s1>) = G_ICMP intpred(eq), %x(<2 x s32>), %non_zero + ; CHECK-NEXT: %cmp2:_(<2 x s1>) = G_ICMP intpred(eq), %y(<2 x s32>), %zero + ; CHECK-NEXT: %and:_(<2 x s1>) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(<2 x s32>) = G_ZEXT %and(<2 x s1>) + ; CHECK-NEXT: $x0 = COPY %zext(<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(<2 x s32>) = COPY $x0 + %y:_(<2 x s32>) = COPY $x1 + %scalar0:_(s32) = G_CONSTANT i32 0 + %scalar1:_(s32) = G_CONSTANT i32 1 + %zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar0(s32) + %non_zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar1(s32) + %cmp1:_(<2 x s1>) = G_ICMP intpred(eq), %x:_(<2 x s32>), %non_zero:_ + %cmp2:_(<2 x s1>) = G_ICMP intpred(eq), %y:_(<2 x s32>), %zero:_ + %and:_(<2 x s1>) = G_AND %cmp1, %cmp2 + %zext:_(<2 x s32>) = G_ZEXT %and:_(<2 x s1>) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_and_eq_0_eq_non_0_vec +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_and_eq_0_eq_non_0_vec + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x s32>) = COPY $x0 + ; CHECK-NEXT: %y:_(<2 x s32>) = COPY $x1 + ; CHECK-NEXT: %scalar0:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %scalar1:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar0(s32) + ; CHECK-NEXT: %non_zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar1(s32) + ; CHECK-NEXT: %cmp1:_(<2 x s1>) = G_ICMP intpred(eq), %x(<2 x s32>), %zero + ; CHECK-NEXT: %cmp2:_(<2 x s1>) = G_ICMP intpred(eq), %y(<2 x s32>), %non_zero + ; CHECK-NEXT: %and:_(<2 x s1>) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(<2 x s32>) = G_ZEXT %and(<2 x s1>) + ; CHECK-NEXT: $x0 = COPY %zext(<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(<2 x s32>) = COPY $x0 + %y:_(<2 x s32>) = COPY $x1 + %scalar0:_(s32) = G_CONSTANT i32 0 + %scalar1:_(s32) = G_CONSTANT i32 1 + %zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar0(s32) + %non_zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar1(s32) + %cmp1:_(<2 x s1>) = G_ICMP intpred(eq), %x:_(<2 x s32>), %zero:_ + %cmp2:_(<2 x s1>) = G_ICMP intpred(eq), %y:_(<2 x s32>), %non_zero:_ + %and:_(<2 x s1>) = G_AND %cmp1, %cmp2 + %zext:_(<2 x s32>) = G_ZEXT %and:_(<2 x s1>) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_and_ne_0_eq_0_vec +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_and_ne_0_eq_0_vec + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x s32>) = COPY $x0 + ; CHECK-NEXT: %y:_(<2 x s32>) = COPY $x1 + ; CHECK-NEXT: %zero_scalar:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + ; CHECK-NEXT: %cmp1:_(<2 x s1>) = G_ICMP intpred(ne), %x(<2 x s32>), %zero + ; CHECK-NEXT: %cmp2:_(<2 x s1>) = G_ICMP intpred(eq), %y(<2 x s32>), %zero + ; CHECK-NEXT: %and:_(<2 x s1>) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(<2 x s32>) = G_ZEXT %and(<2 x s1>) + ; CHECK-NEXT: $x0 = COPY %zext(<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(<2 x s32>) = COPY $x0 + %y:_(<2 x s32>) = COPY $x1 + %zero_scalar:_(s32) = G_CONSTANT i32 0 + %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + %cmp1:_(<2 x s1>) = G_ICMP intpred(ne), %x:_(<2 x s32>), %zero:_ + %cmp2:_(<2 x s1>) = G_ICMP intpred(eq), %y:_(<2 x s32>), %zero:_ + %and:_(<2 x s1>) = G_AND %cmp1, %cmp2 + %zext:_(<2 x s32>) = G_ZEXT %and:_(<2 x s1>) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_and_eq_0_ne_0_vec +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_and_eq_0_ne_0_vec + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x s32>) = COPY $x0 + ; CHECK-NEXT: %y:_(<2 x s32>) = COPY $x1 + ; CHECK-NEXT: %zero_scalar:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + ; CHECK-NEXT: %cmp1:_(<2 x s1>) = G_ICMP intpred(eq), %x(<2 x s32>), %zero + ; CHECK-NEXT: %cmp2:_(<2 x s1>) = G_ICMP intpred(ne), %y(<2 x s32>), %zero + ; CHECK-NEXT: %and:_(<2 x s1>) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(<2 x s32>) = G_ZEXT %and(<2 x s1>) + ; CHECK-NEXT: $x0 = COPY %zext(<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(<2 x s32>) = COPY $x0 + %y:_(<2 x s32>) = COPY $x1 + %zero_scalar:_(s32) = G_CONSTANT i32 0 + %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + %cmp1:_(<2 x s1>) = G_ICMP intpred(eq), %x:_(<2 x s32>), %zero:_ + %cmp2:_(<2 x s1>) = G_ICMP intpred(ne), %y:_(<2 x s32>), %zero:_ + %and:_(<2 x s1>) = G_AND %cmp1, %cmp2 + %zext:_(<2 x s32>) = G_ZEXT %and:_(<2 x s1>) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_and_ne_0_ne_0_vec +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_and_ne_0_ne_0_vec + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x s32>) = COPY $x0 + ; CHECK-NEXT: %y:_(<2 x s32>) = COPY $x1 + ; CHECK-NEXT: %zero_scalar:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + ; CHECK-NEXT: %cmp1:_(<2 x s1>) = G_ICMP intpred(ne), %x(<2 x s32>), %zero + ; CHECK-NEXT: %cmp2:_(<2 x s1>) = G_ICMP intpred(ne), %y(<2 x s32>), %zero + ; CHECK-NEXT: %and:_(<2 x s1>) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(<2 x s32>) = G_ZEXT %and(<2 x s1>) + ; CHECK-NEXT: $x0 = COPY %zext(<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(<2 x s32>) = COPY $x0 + %y:_(<2 x s32>) = COPY $x1 + %zero_scalar:_(s32) = G_CONSTANT i32 0 + %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + %cmp1:_(<2 x s1>) = G_ICMP intpred(ne), %x:_(<2 x s32>), %zero:_ + %cmp2:_(<2 x s1>) = G_ICMP intpred(ne), %y:_(<2 x s32>), %zero:_ + %and:_(<2 x s1>) = G_AND %cmp1, %cmp2 + %zext:_(<2 x s32>) = G_ZEXT %and:_(<2 x s1>) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: valid_or_ne_0_ne_0_vec +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: valid_or_ne_0_ne_0_vec + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x s32>) = COPY $x0 + ; CHECK-NEXT: %y:_(<2 x s32>) = COPY $x1 + ; CHECK-NEXT: %zero_scalar:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<2 x s32>) = G_OR %x, %y + ; CHECK-NEXT: %and:_(<2 x s1>) = G_ICMP intpred(ne), [[OR]](<2 x s32>), %zero + ; CHECK-NEXT: %zext:_(<2 x s32>) = G_ZEXT %and(<2 x s1>) + ; CHECK-NEXT: $x0 = COPY %zext(<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(<2 x s32>) = COPY $x0 + %y:_(<2 x s32>) = COPY $x1 + %zero_scalar:_(s32) = G_CONSTANT i32 0 + %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + %cmp1:_(<2 x s1>) = G_ICMP intpred(ne), %x:_(<2 x s32>), %zero:_ + %cmp2:_(<2 x s1>) = G_ICMP intpred(ne), %y:_(<2 x s32>), %zero:_ + %and:_(<2 x s1>) = G_OR %cmp1, %cmp2 + %zext:_(<2 x s32>) = G_ZEXT %and:_(<2 x s1>) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_or_ne_non_0_ne_0_vec +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_or_ne_non_0_ne_0_vec + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x s32>) = COPY $x0 + ; CHECK-NEXT: %y:_(<2 x s32>) = COPY $x1 + ; CHECK-NEXT: %scalar0:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %scalar1:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar0(s32) + ; CHECK-NEXT: %non_zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar1(s32) + ; CHECK-NEXT: %cmp1:_(<2 x s1>) = G_ICMP intpred(ne), %x(<2 x s32>), %non_zero + ; CHECK-NEXT: %cmp2:_(<2 x s1>) = G_ICMP intpred(ne), %y(<2 x s32>), %zero + ; CHECK-NEXT: %and:_(<2 x s1>) = G_OR %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(<2 x s32>) = G_ZEXT %and(<2 x s1>) + ; CHECK-NEXT: $x0 = COPY %zext(<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(<2 x s32>) = COPY $x0 + %y:_(<2 x s32>) = COPY $x1 + %scalar0:_(s32) = G_CONSTANT i32 0 + %scalar1:_(s32) = G_CONSTANT i32 1 + %zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar0(s32) + %non_zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar1(s32) + %cmp1:_(<2 x s1>) = G_ICMP intpred(ne), %x:_(<2 x s32>), %non_zero:_ + %cmp2:_(<2 x s1>) = G_ICMP intpred(ne), %y:_(<2 x s32>), %zero:_ + %and:_(<2 x s1>) = G_OR %cmp1, %cmp2 + %zext:_(<2 x s32>) = G_ZEXT %and:_(<2 x s1>) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_or_ne_0_ne_non_0_vec +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_or_ne_0_ne_non_0_vec + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x s32>) = COPY $x0 + ; CHECK-NEXT: %y:_(<2 x s32>) = COPY $x1 + ; CHECK-NEXT: %scalar0:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %scalar1:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar0(s32) + ; CHECK-NEXT: %non_zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar1(s32) + ; CHECK-NEXT: %cmp1:_(<2 x s1>) = G_ICMP intpred(ne), %x(<2 x s32>), %zero + ; CHECK-NEXT: %cmp2:_(<2 x s1>) = G_ICMP intpred(ne), %y(<2 x s32>), %non_zero + ; CHECK-NEXT: %and:_(<2 x s1>) = G_OR %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(<2 x s32>) = G_ZEXT %and(<2 x s1>) + ; CHECK-NEXT: $x0 = COPY %zext(<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(<2 x s32>) = COPY $x0 + %y:_(<2 x s32>) = COPY $x1 + %scalar0:_(s32) = G_CONSTANT i32 0 + %scalar1:_(s32) = G_CONSTANT i32 1 + %zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar0(s32) + %non_zero:_(<2 x s32>) = G_BUILD_VECTOR %scalar0(s32), %scalar1(s32) + %cmp1:_(<2 x s1>) = G_ICMP intpred(ne), %x:_(<2 x s32>), %zero:_ + %cmp2:_(<2 x s1>) = G_ICMP intpred(ne), %y:_(<2 x s32>), %non_zero:_ + %and:_(<2 x s1>) = G_OR %cmp1, %cmp2 + %zext:_(<2 x s32>) = G_ZEXT %and:_(<2 x s1>) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_or_eq_0_ne_0_vec +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_or_eq_0_ne_0_vec + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x s32>) = COPY $x0 + ; CHECK-NEXT: %y:_(<2 x s32>) = COPY $x1 + ; CHECK-NEXT: %zero_scalar:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + ; CHECK-NEXT: %cmp1:_(<2 x s1>) = G_ICMP intpred(eq), %x(<2 x s32>), %zero + ; CHECK-NEXT: %cmp2:_(<2 x s1>) = G_ICMP intpred(ne), %y(<2 x s32>), %zero + ; CHECK-NEXT: %and:_(<2 x s1>) = G_OR %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(<2 x s32>) = G_ZEXT %and(<2 x s1>) + ; CHECK-NEXT: $x0 = COPY %zext(<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(<2 x s32>) = COPY $x0 + %y:_(<2 x s32>) = COPY $x1 + %zero_scalar:_(s32) = G_CONSTANT i32 0 + %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + %cmp1:_(<2 x s1>) = G_ICMP intpred(eq), %x:_(<2 x s32>), %zero:_ + %cmp2:_(<2 x s1>) = G_ICMP intpred(ne), %y:_(<2 x s32>), %zero:_ + %and:_(<2 x s1>) = G_OR %cmp1, %cmp2 + %zext:_(<2 x s32>) = G_ZEXT %and:_(<2 x s1>) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_or_ne_0_eq_0_vec +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_or_ne_0_eq_0_vec + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x s32>) = COPY $x0 + ; CHECK-NEXT: %y:_(<2 x s32>) = COPY $x1 + ; CHECK-NEXT: %zero_scalar:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + ; CHECK-NEXT: %cmp1:_(<2 x s1>) = G_ICMP intpred(ne), %x(<2 x s32>), %zero + ; CHECK-NEXT: %cmp2:_(<2 x s1>) = G_ICMP intpred(eq), %y(<2 x s32>), %zero + ; CHECK-NEXT: %and:_(<2 x s1>) = G_OR %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(<2 x s32>) = G_ZEXT %and(<2 x s1>) + ; CHECK-NEXT: $x0 = COPY %zext(<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(<2 x s32>) = COPY $x0 + %y:_(<2 x s32>) = COPY $x1 + %zero_scalar:_(s32) = G_CONSTANT i32 0 + %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + %cmp1:_(<2 x s1>) = G_ICMP intpred(ne), %x:_(<2 x s32>), %zero:_ + %cmp2:_(<2 x s1>) = G_ICMP intpred(eq), %y:_(<2 x s32>), %zero:_ + %and:_(<2 x s1>) = G_OR %cmp1, %cmp2 + %zext:_(<2 x s32>) = G_ZEXT %and:_(<2 x s1>) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_or_eq_0_eq_0_vec +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_or_eq_0_eq_0_vec + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x s32>) = COPY $x0 + ; CHECK-NEXT: %y:_(<2 x s32>) = COPY $x1 + ; CHECK-NEXT: %zero_scalar:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + ; CHECK-NEXT: %cmp1:_(<2 x s1>) = G_ICMP intpred(eq), %x(<2 x s32>), %zero + ; CHECK-NEXT: %cmp2:_(<2 x s1>) = G_ICMP intpred(eq), %y(<2 x s32>), %zero + ; CHECK-NEXT: %and:_(<2 x s1>) = G_OR %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(<2 x s32>) = G_ZEXT %and(<2 x s1>) + ; CHECK-NEXT: $x0 = COPY %zext(<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(<2 x s32>) = COPY $x0 + %y:_(<2 x s32>) = COPY $x1 + %zero_scalar:_(s32) = G_CONSTANT i32 0 + %zero:_(<2 x s32>) = G_BUILD_VECTOR %zero_scalar(s32), %zero_scalar(s32) + %cmp1:_(<2 x s1>) = G_ICMP intpred(eq), %x:_(<2 x s32>), %zero:_ + %cmp2:_(<2 x s1>) = G_ICMP intpred(eq), %y:_(<2 x s32>), %zero:_ + %and:_(<2 x s1>) = G_OR %cmp1, %cmp2 + %zext:_(<2 x s32>) = G_ZEXT %and:_(<2 x s1>) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_p0_src +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: invalid_p0_src + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(p0) = COPY $x0 + ; CHECK-NEXT: %y:_(p0) = COPY $x1 + ; CHECK-NEXT: %zero:_(p0) = G_CONSTANT i64 0 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(eq), %x(p0), %zero + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(eq), %y(p0), %zero + ; CHECK-NEXT: %and:_(s1) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %and(s1) + ; CHECK-NEXT: $x0 = COPY %zext(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(p0) = COPY $x0 + %y:_(p0) = COPY $x1 + %zero:_(p0) = G_CONSTANT i64 0 + %cmp1:_(s1) = G_ICMP intpred(eq), %x:_(p0), %zero:_ + %cmp2:_(s1) = G_ICMP intpred(eq), %y:_(p0), %zero:_ + %and:_(s1) = G_AND %cmp1, %cmp2 + %zext:_(s64) = G_ZEXT %and:_(s1) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... +--- +name: invalid_p0_src_vec +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $q0, $q1 + + ; CHECK-LABEL: name: invalid_p0_src_vec + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x p0>) = COPY $q0 + ; CHECK-NEXT: %y:_(<2 x p0>) = COPY $q1 + ; CHECK-NEXT: %scalar0:_(p0) = G_CONSTANT i64 0 + ; CHECK-NEXT: %zero:_(<2 x p0>) = G_BUILD_VECTOR %scalar0(p0), %scalar0(p0) + ; CHECK-NEXT: %cmp1:_(<2 x s1>) = G_ICMP intpred(eq), %x(<2 x p0>), %zero + ; CHECK-NEXT: %cmp2:_(<2 x s1>) = G_ICMP intpred(eq), %y(<2 x p0>), %zero + ; CHECK-NEXT: %or:_(<2 x s1>) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(<2 x s64>) = G_ZEXT %or(<2 x s1>) + ; CHECK-NEXT: $q0 = COPY %zext(<2 x s64>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %x:_(<2 x p0>) = COPY $q0 + %y:_(<2 x p0>) = COPY $q1 + %scalar0:_(p0) = G_CONSTANT i64 0 + %zero:_(<2 x p0>) = G_BUILD_VECTOR %scalar0(p0), %scalar0(p0) + %cmp1:_(<2 x s1>) = G_ICMP intpred(eq), %x:_(<2 x p0>), %zero:_ + %cmp2:_(<2 x s1>) = G_ICMP intpred(eq), %y:_(<2 x p0>), %zero:_ + %or:_(<2 x 
s1>) = G_AND %cmp1, %cmp2 + %zext:_(<2 x s64>) = G_ZEXT %or:_(<2 x s1>) + $q0 = COPY %zext + RET_ReallyLR implicit $q0 + +... +--- +name: invalid_diff_src_ty +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $w0, $x1 + + ; CHECK-LABEL: name: invalid_diff_src_ty + ; CHECK: liveins: $w0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: %y:_(s64) = COPY $x1 + ; CHECK-NEXT: %zero_s32:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %zero_s64:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %cmp1:_(s1) = G_ICMP intpred(eq), %x(s32), %zero_s32 + ; CHECK-NEXT: %cmp2:_(s1) = G_ICMP intpred(eq), %y(s64), %zero_s64 + ; CHECK-NEXT: %and:_(s1) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %and(s1) + ; CHECK-NEXT: $x0 = COPY %zext(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s32) = COPY $w0 + %y:_(s64) = COPY $x1 + %zero_s32:_(s32) = G_CONSTANT i32 0 + %zero_s64:_(s64) = G_CONSTANT i64 0 + %cmp1:_(s1) = G_ICMP intpred(eq), %x:_(s32), %zero_s32:_ + %cmp2:_(s1) = G_ICMP intpred(eq), %y:_(s64), %zero_s64:_ + %and:_(s1) = G_AND %cmp1, %cmp2 + %zext:_(s64) = G_ZEXT %and:_(s1) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
+--- +name: invalid_diff_src_ty_vec +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $x0, $q1 + + ; CHECK-LABEL: name: invalid_diff_src_ty_vec + ; CHECK: liveins: $x0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x s32>) = COPY $x0 + ; CHECK-NEXT: %y:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: %scalar0s32:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %scalar0s64:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %zero_s32:_(<2 x s32>) = G_BUILD_VECTOR %scalar0s32(s32), %scalar0s32(s32) + ; CHECK-NEXT: %zero_s64:_(<2 x s64>) = G_BUILD_VECTOR %scalar0s64(s64), %scalar0s64(s64) + ; CHECK-NEXT: %cmp1:_(<2 x s1>) = G_ICMP intpred(eq), %x(<2 x s32>), %zero_s32 + ; CHECK-NEXT: %cmp2:_(<2 x s1>) = G_ICMP intpred(eq), %y(<2 x s64>), %zero_s64 + ; CHECK-NEXT: %or:_(<2 x s1>) = G_AND %cmp1, %cmp2 + ; CHECK-NEXT: %zext:_(<2 x s32>) = G_ZEXT %or(<2 x s1>) + ; CHECK-NEXT: $x0 = COPY %zext(<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(<2 x s32>) = COPY $x0 + %y:_(<2 x s64>) = COPY $q1 + %scalar0s32:_(s32) = G_CONSTANT i32 0 + %scalar0s64:_(s64) = G_CONSTANT i64 0 + %zero_s32:_(<2 x s32>) = G_BUILD_VECTOR %scalar0s32(s32), %scalar0s32(s32) + %zero_s64:_(<2 x s64>) = G_BUILD_VECTOR %scalar0s64(s64), %scalar0s64(s64) + %cmp1:_(<2 x s1>) = G_ICMP intpred(eq), %x:_(<2 x s32>), %zero_s32:_ + %cmp2:_(<2 x s1>) = G_ICMP intpred(eq), %y:_(<2 x s64>), %zero_s64:_ + %or:_(<2 x s1>) = G_AND %cmp1, %cmp2 + %zext:_(<2 x s32>) = G_ZEXT %or:_(<2 x s1>) + $x0 = COPY %zext + RET_ReallyLR implicit $x0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/fmaxnum.ll b/llvm/test/CodeGen/AMDGPU/fmaxnum.ll index 09898f1442fb82..38640a18b5aee6 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaxnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaxnum.ll @@ -152,7 +152,7 @@ define amdgpu_kernel void @constant_fold_fmax_f32_p0_n0(ptr addrspace(1) %out) # ; GCN-LABEL: {{^}}constant_fold_fmax_f32_n0_p0: ; GCN-NOT: v_max_f32_e32 -; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] define amdgpu_kernel void @constant_fold_fmax_f32_n0_p0(ptr addrspace(1) %out) #0 { %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) diff --git a/llvm/test/CodeGen/AMDGPU/fminnum.ll b/llvm/test/CodeGen/AMDGPU/fminnum.ll index 844d26a6225b40..65b311845a6b77 100644 --- a/llvm/test/CodeGen/AMDGPU/fminnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fminnum.ll @@ -150,7 +150,7 @@ define amdgpu_kernel void @constant_fold_fmin_f32_p0_p0(ptr addrspace(1) %out) # ; GCN-LABEL: {{^}}constant_fold_fmin_f32_p0_n0: ; GCN-NOT: v_min_f32_e32 -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} ; GCN: buffer_store_dword [[REG]] define amdgpu_kernel void @constant_fold_fmin_f32_p0_n0(ptr addrspace(1) %out) #0 { %val = call float @llvm.minnum.f32(float 0.0, float -0.0) diff --git a/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir b/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir index d006aa9ba38e84..2bc4288884f192 100644 --- a/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir +++ b/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir @@ -89,7 +89,7 @@ body: | # CHECK: $sp = t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, $r4, $r5, $r6, undef $r7, $r8, $r9, $r10, $r11 # CHECK-NEXT: $r0 = t2BICri $r0, 1, 14 /* CC::al */, $noreg, $noreg # CHECK-NEXT: $sp = tSUBspi $sp, 34, 14 /* CC::al */, $noreg -# CHECK-NEXT: VLSTM $sp, 14 /* CC::al */, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit undef $vpr, implicit undef $fpscr, 
implicit undef $fpscr_nzcv, implicit undef $d0, implicit undef $d1, implicit undef $d2, implicit undef $d3, implicit undef $d4, implicit undef $d5, implicit undef $d6, implicit undef $d7, implicit $d8, implicit $d9, implicit $d10, implicit $d11, implicit $d12, implicit $d13, implicit $d14, implicit $d15 +# CHECK-NEXT: VLSTM $sp, 14 /* CC::al */, $noreg, implicit undef $vpr, implicit undef $fpscr, implicit undef $fpscr_nzcv, implicit undef $q0, implicit undef $q1, implicit undef $q2, implicit undef $q3, implicit undef $q4, implicit undef $q5, implicit undef $q6, implicit undef $q7 # CHECK-NEXT: $r1 = tMOVr $r0, 14 /* CC::al */, $noreg # CHECK-NEXT: $r2 = tMOVr $r0, 14 /* CC::al */, $noreg # CHECK-NEXT: $r3 = tMOVr $r0, 14 /* CC::al */, $noreg @@ -105,7 +105,7 @@ body: | # CHECK-NEXT: t2MSR_M 3072, $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr # CHECK-NEXT: tBLXNSr 14 /* CC::al */, $noreg, killed $r0, csr_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $s0 # CHECK-NEXT: $r12 = VMOVRS $s0, 14 /* CC::al */, $noreg -# CHECK-NEXT: VLLDM $sp, 14 /* CC::al */, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit-def $d0, implicit-def $d1, implicit-def $d2, implicit-def $d3, implicit-def $d4, implicit-def $d5, implicit-def $d6, implicit-def $d7, implicit-def $d8, implicit-def $d9, implicit-def $d10, implicit-def $d11, implicit-def $d12, implicit-def $d13, implicit-def $d14, implicit-def $d15 +# CHECK-NEXT: VLLDM $sp, 14 /* CC::al */, $noreg, implicit-def $q0, implicit-def $q1, implicit-def $q2, implicit-def $q3, implicit-def $q4, implicit-def $q5, implicit-def $q6, implicit-def $q7, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv # CHECK-NEXT: $s0 = VMOVSR $r12, 14 /* CC::al */, $noreg # CHECK-NEXT: $sp = tADDspi $sp, 34, 14 /* CC::al */, $noreg # CHECK-NEXT: $sp = t2LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def 
$r10, def $r11 diff --git a/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir b/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir index ad53addcc21a35..8c49a531674115 100644 --- a/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir +++ b/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir @@ -2,7 +2,7 @@ --- | target triple = "thumbv8m.main-arm-none-eabi" - define hidden void @foo(void ()* nocapture %baz) local_unnamed_addr #0 { + define hidden void @foo(ptr nocapture %baz) local_unnamed_addr #0 { entry: %call = call i32 @bar() #0 %tobool = icmp eq i32 %call, 0 @@ -55,13 +55,14 @@ body: | tBL 14, $noreg, @bar, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $r0 bb.2.land.end: - liveins: $r4, $vpr, $fpscr, $fpscr_nzcv, $d0, $d1, $d2, $d3, $d4, $d5, $d6, $d7, $d8, $d9, $d10, $d11, $d12, $d13, $d14, $d15 + liveins: $r4 + $sp = t2STMDB_UPD $sp, 14, $noreg, $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $r11 $r4 = t2BICri $r4, 1, 14, $noreg, $noreg $sp = tSUBspi $sp, 34, 14, $noreg - VLSTM $sp, 14, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit $vpr, implicit $fpscr, implicit $fpscr_nzcv, implicit $d0, implicit $d1, implicit $d2, implicit $d3, implicit $d4, implicit $d5, implicit $d6, implicit $d7, implicit $d8, implicit $d9, implicit $d10, implicit $d11, implicit $d12, implicit $d13, implicit $d14, implicit $d15 + VLSTM $sp, 14, $noreg tBLXNSr 14, $noreg, killed $r4, csr_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp - VLLDM $sp, 14, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit-def $d0, implicit-def $d1, implicit-def $d2, implicit-def $d3, implicit-def $d4, implicit-def $d5, implicit-def $d6, implicit-def $d7, implicit-def $d8, implicit-def $d9, implicit-def $d10, implicit-def $d11, implicit-def $d12, implicit-def $d13, implicit-def $d14, implicit-def $d15 + VLLDM $sp, 14, $noreg, implicit-def $q0, 
implicit-def $q1, implicit-def $q2, implicit-def $q3, implicit-def $q4, implicit-def $q5, implicit-def $q6, implicit-def $q7, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv $sp = tADDspi $sp, 34, 14, $noreg $sp = t2LDMIA_UPD $sp, 14, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11 $sp = t2LDMIA_RET $sp, 14, $noreg, def $r4, def $pc diff --git a/llvm/test/CodeGen/PowerPC/crsave.ll b/llvm/test/CodeGen/PowerPC/crsave.ll index 81e7a0adcc8ca1..bde49d02e86ebd 100644 --- a/llvm/test/CodeGen/PowerPC/crsave.ll +++ b/llvm/test/CodeGen/PowerPC/crsave.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -O0 -frame-pointer=all -mtriple=powerpc-unknown-linux-gnu -mcpu=g5 < %s | FileCheck %s -check-prefix=PPC32 ; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mcpu=g5 < %s | FileCheck %s -check-prefix=PPC64 ; RUN: llc -O0 -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs < %s | FileCheck %s -check-prefix=PPC64-ELFv2 @@ -5,6 +6,101 @@ declare void @foo() define i32 @test_cr2() nounwind uwtable { +; PPC32-LABEL: test_cr2: +; PPC32: # %bb.0: # %entry +; PPC32-NEXT: mflr 0 +; PPC32-NEXT: stwu 1, -32(1) +; PPC32-NEXT: stw 31, 28(1) +; PPC32-NEXT: stw 0, 36(1) +; PPC32-NEXT: .cfi_def_cfa_offset 32 +; PPC32-NEXT: .cfi_offset r31, -4 +; PPC32-NEXT: .cfi_offset lr, 4 +; PPC32-NEXT: mr 31, 1 +; PPC32-NEXT: .cfi_def_cfa_register r31 +; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 12, 24(31) +; PPC32-NEXT: li 3, 1 +; PPC32-NEXT: li 4, 2 +; PPC32-NEXT: li 5, 3 +; PPC32-NEXT: li 6, 0 +; PPC32-NEXT: #APP +; PPC32-EMPTY: +; PPC32-NEXT: mtcr 6 +; PPC32-NEXT: cmpw 2, 4, 3 +; PPC32-NEXT: mfcr 3 +; PPC32-NEXT: #NO_APP +; PPC32-NEXT: stw 3, 20(31) +; PPC32-NEXT: bl foo +; PPC32-NEXT: lwz 3, 20(31) +; PPC32-NEXT: lwz 12, 24(31) +; PPC32-NEXT: mtocrf 32, 12 +; PPC32-NEXT: lwz 0, 36(1) +; PPC32-NEXT: lwz 31, 28(1) +; PPC32-NEXT: addi 1, 1, 32 +; PPC32-NEXT: mtlr 0 +; 
PPC32-NEXT: blr +; +; PPC64-LABEL: test_cr2: +; PPC64: # %bb.0: # %entry +; PPC64-NEXT: mflr 0 +; PPC64-NEXT: mfcr 12 +; PPC64-NEXT: stw 12, 8(1) +; PPC64-NEXT: stdu 1, -128(1) +; PPC64-NEXT: std 0, 144(1) +; PPC64-NEXT: .cfi_def_cfa_offset 128 +; PPC64-NEXT: .cfi_offset lr, 16 +; PPC64-NEXT: .cfi_offset cr2, 8 +; PPC64-NEXT: li 3, 1 +; PPC64-NEXT: li 4, 2 +; PPC64-NEXT: li 5, 3 +; PPC64-NEXT: li 6, 0 +; PPC64-NEXT: #APP +; PPC64-EMPTY: +; PPC64-NEXT: mtcr 6 +; PPC64-NEXT: cmpw 2, 4, 3 +; PPC64-NEXT: mfcr 3 +; PPC64-NEXT: #NO_APP +; PPC64-NEXT: stw 3, 124(1) +; PPC64-NEXT: bl foo +; PPC64-NEXT: nop +; PPC64-NEXT: lwz 3, 124(1) +; PPC64-NEXT: addi 1, 1, 128 +; PPC64-NEXT: ld 0, 16(1) +; PPC64-NEXT: lwz 12, 8(1) +; PPC64-NEXT: mtocrf 32, 12 +; PPC64-NEXT: mtlr 0 +; PPC64-NEXT: blr +; +; PPC64-ELFv2-LABEL: test_cr2: +; PPC64-ELFv2: # %bb.0: # %entry +; PPC64-ELFv2-NEXT: mflr 0 +; PPC64-ELFv2-NEXT: mfocrf 12, 32 +; PPC64-ELFv2-NEXT: stw 12, 8(1) +; PPC64-ELFv2-NEXT: stdu 1, -112(1) +; PPC64-ELFv2-NEXT: std 0, 128(1) +; PPC64-ELFv2-NEXT: .cfi_def_cfa_offset 112 +; PPC64-ELFv2-NEXT: .cfi_offset lr, 16 +; PPC64-ELFv2-NEXT: .cfi_offset cr2, 8 +; PPC64-ELFv2-NEXT: li 3, 1 +; PPC64-ELFv2-NEXT: li 4, 2 +; PPC64-ELFv2-NEXT: li 5, 3 +; PPC64-ELFv2-NEXT: li 6, 0 +; PPC64-ELFv2-NEXT: #APP +; PPC64-ELFv2-EMPTY: +; PPC64-ELFv2-NEXT: mtcr 6 +; PPC64-ELFv2-NEXT: cmpw 2, 4, 3 +; PPC64-ELFv2-NEXT: mfcr 3 +; PPC64-ELFv2-NEXT: #NO_APP +; PPC64-ELFv2-NEXT: stw 3, 108(1) +; PPC64-ELFv2-NEXT: bl foo +; PPC64-ELFv2-NEXT: nop +; PPC64-ELFv2-NEXT: lwz 3, 108(1) +; PPC64-ELFv2-NEXT: addi 1, 1, 112 +; PPC64-ELFv2-NEXT: ld 0, 16(1) +; PPC64-ELFv2-NEXT: lwz 12, 8(1) +; PPC64-ELFv2-NEXT: mtocrf 32, 12 +; PPC64-ELFv2-NEXT: mtlr 0 +; PPC64-ELFv2-NEXT: blr entry: %ret = alloca i32, align 4 %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmpw 2,$2,$1\0A\09mfcr $0", "=r,r,r,r,r,~{cr2}"(i32 1, i32 2, i32 3, i32 0) nounwind @@ -14,27 +110,104 @@ entry: ret i32 %1 } -; PPC32-LABEL: test_cr2: -; PPC32: 
stwu 1, -32(1) -; PPC32: stw 31, 28(1) -; PPC32: mfcr 12 -; PPC32-NEXT: stw 12, 24(31) -; PPC32: lwz 12, 24(31) -; PPC32-NEXT: mtocrf 32, 12 - -; PPC64: .cfi_startproc -; PPC64: mfcr 12 -; PPC64: stw 12, 8(1) -; PPC64: stdu 1, -[[AMT:[0-9]+]](1) -; PPC64: .cfi_def_cfa_offset 128 -; PPC64: .cfi_offset lr, 16 -; PPC64: .cfi_offset cr2, 8 -; PPC64: addi 1, 1, [[AMT]] -; PPC64: lwz 12, 8(1) -; PPC64: mtocrf 32, 12 -; PPC64: .cfi_endproc - define i32 @test_cr234() nounwind { +; PPC32-LABEL: test_cr234: +; PPC32: # %bb.0: # %entry +; PPC32-NEXT: mflr 0 +; PPC32-NEXT: stwu 1, -32(1) +; PPC32-NEXT: stw 31, 28(1) +; PPC32-NEXT: stw 0, 36(1) +; PPC32-NEXT: mr 31, 1 +; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 12, 24(31) +; PPC32-NEXT: li 3, 1 +; PPC32-NEXT: li 4, 2 +; PPC32-NEXT: li 5, 3 +; PPC32-NEXT: li 6, 0 +; PPC32-NEXT: #APP +; PPC32-EMPTY: +; PPC32-NEXT: mtcr 6 +; PPC32-NEXT: cmpw 2, 4, 3 +; PPC32-NEXT: cmpw 3, 4, 4 +; PPC32-NEXT: cmpw 4, 4, 5 +; PPC32-NEXT: mfcr 3 +; PPC32-NEXT: #NO_APP +; PPC32-NEXT: stw 3, 20(31) +; PPC32-NEXT: bl foo +; PPC32-NEXT: lwz 3, 20(31) +; PPC32-NEXT: lwz 12, 24(31) +; PPC32-NEXT: mtocrf 32, 12 +; PPC32-NEXT: mtocrf 16, 12 +; PPC32-NEXT: mtocrf 8, 12 +; PPC32-NEXT: lwz 0, 36(1) +; PPC32-NEXT: lwz 31, 28(1) +; PPC32-NEXT: addi 1, 1, 32 +; PPC32-NEXT: mtlr 0 +; PPC32-NEXT: blr +; +; PPC64-LABEL: test_cr234: +; PPC64: # %bb.0: # %entry +; PPC64-NEXT: mflr 0 +; PPC64-NEXT: mfcr 12 +; PPC64-NEXT: stw 12, 8(1) +; PPC64-NEXT: stdu 1, -128(1) +; PPC64-NEXT: std 0, 144(1) +; PPC64-NEXT: li 3, 1 +; PPC64-NEXT: li 4, 2 +; PPC64-NEXT: li 5, 3 +; PPC64-NEXT: li 6, 0 +; PPC64-NEXT: #APP +; PPC64-EMPTY: +; PPC64-NEXT: mtcr 6 +; PPC64-NEXT: cmpw 2, 4, 3 +; PPC64-NEXT: cmpw 3, 4, 4 +; PPC64-NEXT: cmpw 4, 4, 5 +; PPC64-NEXT: mfcr 3 +; PPC64-NEXT: #NO_APP +; PPC64-NEXT: stw 3, 124(1) +; PPC64-NEXT: bl foo +; PPC64-NEXT: nop +; PPC64-NEXT: lwz 3, 124(1) +; PPC64-NEXT: addi 1, 1, 128 +; PPC64-NEXT: ld 0, 16(1) +; PPC64-NEXT: lwz 12, 8(1) +; PPC64-NEXT: mtocrf 32, 
12 +; PPC64-NEXT: mtocrf 16, 12 +; PPC64-NEXT: mtocrf 8, 12 +; PPC64-NEXT: mtlr 0 +; PPC64-NEXT: blr +; +; PPC64-ELFv2-LABEL: test_cr234: +; PPC64-ELFv2: # %bb.0: # %entry +; PPC64-ELFv2-NEXT: mflr 0 +; PPC64-ELFv2-NEXT: mfcr 12 +; PPC64-ELFv2-NEXT: stw 12, 8(1) +; PPC64-ELFv2-NEXT: stdu 1, -112(1) +; PPC64-ELFv2-NEXT: std 0, 128(1) +; PPC64-ELFv2-NEXT: li 3, 1 +; PPC64-ELFv2-NEXT: li 4, 2 +; PPC64-ELFv2-NEXT: li 5, 3 +; PPC64-ELFv2-NEXT: li 6, 0 +; PPC64-ELFv2-NEXT: #APP +; PPC64-ELFv2-EMPTY: +; PPC64-ELFv2-NEXT: mtcr 6 +; PPC64-ELFv2-NEXT: cmpw 2, 4, 3 +; PPC64-ELFv2-NEXT: cmpw 3, 4, 4 +; PPC64-ELFv2-NEXT: cmpw 4, 4, 5 +; PPC64-ELFv2-NEXT: mfcr 3 +; PPC64-ELFv2-NEXT: #NO_APP +; PPC64-ELFv2-NEXT: stw 3, 108(1) +; PPC64-ELFv2-NEXT: bl foo +; PPC64-ELFv2-NEXT: nop +; PPC64-ELFv2-NEXT: lwz 3, 108(1) +; PPC64-ELFv2-NEXT: addi 1, 1, 112 +; PPC64-ELFv2-NEXT: ld 0, 16(1) +; PPC64-ELFv2-NEXT: lwz 12, 8(1) +; PPC64-ELFv2-NEXT: mtocrf 32, 12 +; PPC64-ELFv2-NEXT: mtocrf 16, 12 +; PPC64-ELFv2-NEXT: mtocrf 8, 12 +; PPC64-ELFv2-NEXT: mtlr 0 +; PPC64-ELFv2-NEXT: blr entry: %ret = alloca i32, align 4 %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmpw 2,$2,$1\0A\09cmpw 3,$2,$2\0A\09cmpw 4,$2,$3\0A\09mfcr $0", "=r,r,r,r,r,~{cr2},~{cr3},~{cr4}"(i32 1, i32 2, i32 3, i32 0) nounwind @@ -44,41 +217,102 @@ entry: ret i32 %1 } -; PPC32-LABEL: test_cr234: -; PPC32: stwu 1, -32(1) -; PPC32: stw 31, 28(1) -; PPC32: mfcr 12 -; PPC32-NEXT: stw 12, 24(31) -; PPC32: lwz 12, 24(31) -; PPC32-NEXT: mtocrf 32, 12 -; PPC32-NEXT: mtocrf 16, 12 -; PPC32-NEXT: mtocrf 8, 12 - -; PPC64: mfcr 12 -; PPC64: stw 12, 8(1) -; PPC64: stdu 1, -[[AMT:[0-9]+]](1) -; PPC64: addi 1, 1, [[AMT]] -; PPC64: lwz 12, 8(1) -; PPC64: mtocrf 32, 12 -; PPC64: mtocrf 16, 12 -; PPC64: mtocrf 8, 12 - ; Generate mfocrf in prologue when we need to save 1 nonvolatile CR field define void @cloberOneNvCrField() { +; PPC32-LABEL: cloberOneNvCrField: +; PPC32: # %bb.0: # %entry +; PPC32-NEXT: stwu 1, -32(1) +; PPC32-NEXT: stw 31, 
28(1) +; PPC32-NEXT: .cfi_def_cfa_offset 32 +; PPC32-NEXT: .cfi_offset r31, -4 +; PPC32-NEXT: mr 31, 1 +; PPC32-NEXT: .cfi_def_cfa_register r31 +; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 12, 24(31) +; PPC32-NEXT: #APP +; PPC32-NEXT: # clobbers +; PPC32-NEXT: #NO_APP +; PPC32-NEXT: lwz 12, 24(31) +; PPC32-NEXT: mtocrf 32, 12 +; PPC32-NEXT: lwz 31, 28(1) +; PPC32-NEXT: addi 1, 1, 32 +; PPC32-NEXT: blr +; +; PPC64-LABEL: cloberOneNvCrField: +; PPC64: # %bb.0: # %entry +; PPC64-NEXT: mfcr 12 +; PPC64-NEXT: stw 12, 8(1) +; PPC64-NEXT: #APP +; PPC64-NEXT: # clobbers +; PPC64-NEXT: #NO_APP +; PPC64-NEXT: lwz 12, 8(1) +; PPC64-NEXT: mtocrf 32, 12 +; PPC64-NEXT: blr +; +; PPC64-ELFv2-LABEL: cloberOneNvCrField: +; PPC64-ELFv2: # %bb.0: # %entry +; PPC64-ELFv2-NEXT: mfocrf 12, 32 +; PPC64-ELFv2-NEXT: stw 12, 8(1) +; PPC64-ELFv2-NEXT: #APP +; PPC64-ELFv2-NEXT: # clobbers +; PPC64-ELFv2-NEXT: #NO_APP +; PPC64-ELFv2-NEXT: lwz 12, 8(1) +; PPC64-ELFv2-NEXT: mtocrf 32, 12 +; PPC64-ELFv2-NEXT: blr entry: tail call void asm sideeffect "# clobbers", "~{cr2}"() ret void - -; PPC64-ELFv2-LABEL: @cloberOneNvCrField -; PPC64-ELFv2: mfocrf [[REG1:[0-9]+]], 32 } ; Generate mfcr in prologue when we need to save all nonvolatile CR field define void @cloberAllNvCrField() { +; PPC32-LABEL: cloberAllNvCrField: +; PPC32: # %bb.0: # %entry +; PPC32-NEXT: stwu 1, -32(1) +; PPC32-NEXT: stw 31, 28(1) +; PPC32-NEXT: .cfi_def_cfa_offset 32 +; PPC32-NEXT: .cfi_offset r31, -4 +; PPC32-NEXT: mr 31, 1 +; PPC32-NEXT: .cfi_def_cfa_register r31 +; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 12, 24(31) +; PPC32-NEXT: #APP +; PPC32-NEXT: # clobbers +; PPC32-NEXT: #NO_APP +; PPC32-NEXT: lwz 12, 24(31) +; PPC32-NEXT: mtocrf 32, 12 +; PPC32-NEXT: mtocrf 16, 12 +; PPC32-NEXT: mtocrf 8, 12 +; PPC32-NEXT: lwz 31, 28(1) +; PPC32-NEXT: addi 1, 1, 32 +; PPC32-NEXT: blr +; +; PPC64-LABEL: cloberAllNvCrField: +; PPC64: # %bb.0: # %entry +; PPC64-NEXT: mfcr 12 +; PPC64-NEXT: stw 12, 8(1) +; PPC64-NEXT: #APP +; PPC64-NEXT: # 
clobbers +; PPC64-NEXT: #NO_APP +; PPC64-NEXT: lwz 12, 8(1) +; PPC64-NEXT: mtocrf 32, 12 +; PPC64-NEXT: mtocrf 16, 12 +; PPC64-NEXT: mtocrf 8, 12 +; PPC64-NEXT: blr +; +; PPC64-ELFv2-LABEL: cloberAllNvCrField: +; PPC64-ELFv2: # %bb.0: # %entry +; PPC64-ELFv2-NEXT: mfcr 12 +; PPC64-ELFv2-NEXT: stw 12, 8(1) +; PPC64-ELFv2-NEXT: #APP +; PPC64-ELFv2-NEXT: # clobbers +; PPC64-ELFv2-NEXT: #NO_APP +; PPC64-ELFv2-NEXT: lwz 12, 8(1) +; PPC64-ELFv2-NEXT: mtocrf 32, 12 +; PPC64-ELFv2-NEXT: mtocrf 16, 12 +; PPC64-ELFv2-NEXT: mtocrf 8, 12 +; PPC64-ELFv2-NEXT: blr entry: tail call void asm sideeffect "# clobbers", "~{cr2},~{cr3},~{cr4}"() ret void - -; PPC64-ELFv2-LABEL: @cloberAllNvCrField -; PPC64-ELFv2: mfcr [[REG1:[0-9]+]] } diff --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll index 90d78779b764d2..18b66499b85fe3 100644 --- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll +++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+a -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=NOZACAS,RV32IA %s -; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+a,+zacas -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=ZACAS,RV32IA-ZACAS %s ; RUN: llc -mtriple=riscv64 -mattr=+a -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=NOZACAS,RV64IA %s -; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=ZACAS,RV64IA-ZACAS %s -; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas,+experimental-zabha -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas,+experimental-zabha 
-verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=ZACAS,RV64IA-ZABHA %s ; Test cmpxchg followed by a branch on the cmpxchg success value to see if the diff --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll index 8df37bf40975c1..394dffa346ec63 100644 --- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll @@ -3,25 +3,25 @@ ; RUN: | FileCheck -check-prefix=RV32I %s ; RUN: llc -mtriple=riscv32 -mattr=+a -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV32IA,RV32IA-WMO %s -; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+a,+zacas -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV32IA,RV32IA-ZACAS,RV32IA-WMO-ZACAS %s ; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-ztso -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV32IA,RV32IA-TSO %s -; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-ztso,+experimental-zacas -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-ztso,+zacas -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV32IA,RV32IA-ZACAS,RV32IA-TSO-ZACAS %s ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64I %s ; RUN: llc -mtriple=riscv64 -mattr=+a -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64IA,RV64IA-WMO %s -; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64IA,RV64IA-ZACAS,RV64IA-WMO-ZACAS %s -; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas,+experimental-zabha -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas,+experimental-zabha -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64IA,RV64IA-ZABHA,RV64IA-WMO-ZABHA %s ; RUN: llc 
-mtriple=riscv64 -mattr=+a,+experimental-ztso -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64IA,RV64IA-TSO %s -; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-ztso,+experimental-zacas -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-ztso,+zacas -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64IA,RV64IA-ZACAS,RV64IA-TSO-ZACAS %s -; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-ztso,+experimental-zacas,+experimental-zabha -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-ztso,+zacas,+experimental-zabha -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64IA,RV64IA-ZABHA,RV64IA-TSO-ZABHA %s define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll index ee802507a02f3c..fe530017406133 100644 --- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll +++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll @@ -12,22 +12,22 @@ ; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-ztso -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64IA,RV64IA-NOZACAS,RV64IA-TSO,RV64IA-TSO-NOZACAS %s -; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+a,+zacas -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV32IA,RV32IA-ZACAS,RV32IA-WMO,RV32IA-WMO-ZACAS %s -; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-ztso,+experimental-zacas -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-ztso,+zacas -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV32IA,RV32IA-ZACAS,RV32IA-TSO,RV32IA-TSO-ZACAS %s -; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas -verify-machineinstrs < %s \ ; RUN: | FileCheck 
-check-prefixes=RV64IA,RV64IA-ZACAS,RV64IA-WMO,RV64IA-WMO-ZACAS %s -; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-ztso,+experimental-zacas -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-ztso,+zacas -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64IA,RV64IA-ZACAS,RV64IA-TSO,RV64IA-TSO-ZACAS %s ; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zabha -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64IA,RV64IA-WMO,RV64IA-WMO-ZABHA,RV64IA-WMO-ZABHA-NOZACAS %s ; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-ztso,+experimental-zabha -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64IA,RV64IA-TSO,RV64IA-TSO-ZABHA,RV64IA-TSO-ZABHA-NOZACAS %s -; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zabha,+experimental-zacas -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zabha,+zacas -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64IA,RV64IA-WMO,RV64IA-WMO-ZABHA,RV64IA-WMO-ZABHA-ZACAS %s -; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-ztso,+experimental-zabha,+experimental-zacas -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-ztso,+experimental-zabha,+zacas -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64IA,RV64IA-TSO,RV64IA-TSO-ZABHA,RV64IA-TSO-ZABHA-ZACAS %s define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind { diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll index 47807f78d176e8..bdf3b28d2d523b 100644 --- a/llvm/test/CodeGen/RISCV/atomic-signext.ll +++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll @@ -3,13 +3,13 @@ ; RUN: | FileCheck -check-prefix=RV32I %s ; RUN: llc -mtriple=riscv32 -mattr=+a -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV32IA,RV32IA-NOZACAS %s -; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 
-mattr=+a,+zacas -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV32IA,RV32IA-ZACAS %s ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64I %s ; RUN: llc -mtriple=riscv64 -mattr=+a -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64IA,RV64IA-NOZACAS %s -; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64IA,RV64IA-ZACAS %s define signext i8 @atomic_load_i8_unordered(ptr %a) nounwind { diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 13635a94d6411a..561b0f21dc3770 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -111,7 +111,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFMIN %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfwma %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFWMA %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zaamo %s -o - | FileCheck --check-prefix=RV32ZAAMO %s -; RUN: llc -mtriple=riscv32 -mattr=+experimental-zacas %s -o - | FileCheck --check-prefix=RV32ZACAS %s +; RUN: llc -mtriple=riscv32 -mattr=+zacas %s -o - | FileCheck --check-prefix=RV32ZACAS %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zalasr %s -o - | FileCheck --check-prefix=RV32ZALASR %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zalrsc %s -o - | FileCheck --check-prefix=RV32ZALRSC %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicfilp %s -o - | FileCheck --check-prefix=RV32ZICFILP %s @@ -240,7 +240,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFMIN %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfwma %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFWMA %s ; RUN: llc -mtriple=riscv64 
-mattr=+experimental-zaamo %s -o - | FileCheck --check-prefix=RV64ZAAMO %s -; RUN: llc -mtriple=riscv64 -mattr=+experimental-zacas %s -o - | FileCheck --check-prefix=RV64ZACAS %s +; RUN: llc -mtriple=riscv64 -mattr=+zacas %s -o - | FileCheck --check-prefix=RV64ZACAS %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zalasr %s -o - | FileCheck --check-prefix=RV64ZALASR %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zalrsc %s -o - | FileCheck --check-prefix=RV64ZALRSC %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicfilp %s -o - | FileCheck --check-prefix=RV64ZICFILP %s diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index 191f047131fb16..5d09c39dfd6e68 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -2849,23 +2849,21 @@ for.body: ; preds = %for.body.preheader, br i1 %cmp.not, label %for.cond.cleanup, label %for.body } -declare <4 x i32> @llvm.vp.mul.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) +declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) -define void @sink_splat_vp_mul(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_mul: +define void @sink_splat_min(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_min: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB46_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vmul.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmin.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB46_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB46_1 ; 
CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2877,9 +2875,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) store <4 x i32> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -2887,23 +2885,19 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) - -define void @sink_splat_vp_add(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_add: +define void @sink_splat_min_commute(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_min_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB47_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vadd.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmin.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB47_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB47_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2915,9 +2909,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = 
load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load) store <4 x i32> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -2925,21 +2919,21 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_vp_add_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_add_commute: +declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) + +define void @sink_splat_max(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_max: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB48_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vadd.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmax.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB48_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB48_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2951,9 +2945,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) store <4 x i32> %1, ptr %0, align 4 - 
%index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -2961,23 +2955,19 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.sub.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) - -define void @sink_splat_vp_sub(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_sub: +define void @sink_splat_max_commute(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_max_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB49_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vsub.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmax.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB49_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB49_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2989,9 +2979,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load) store <4 x i32> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -2999,21 +2989,21 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void 
@sink_splat_vp_rsub(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_rsub: +declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>) + +define void @sink_splat_umin(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_umin: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB50_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vrsub.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vminu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB50_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB50_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -3025,9 +3015,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) store <4 x i32> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3035,23 +3025,19 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.shl.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) - -define void @sink_splat_vp_shl(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_shl: +define void @sink_splat_umin_commute(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: 
sink_splat_umin_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB51_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vsll.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vminu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB51_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB51_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -3063,9 +3049,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.shl.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load) store <4 x i32> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3073,23 +3059,21 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) +declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) -define void @sink_splat_vp_lshr(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_lshr: +define void @sink_splat_umax(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_umax: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB52_1: # 
%vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vsrl.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmaxu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB52_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB52_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -3101,9 +3085,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) store <4 x i32> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3111,23 +3095,19 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) - -define void @sink_splat_vp_ashr(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_ashr: +define void @sink_splat_umax_commute(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_umax_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB53_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vsra.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmaxu.vx 
v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB53_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB53_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -3139,9 +3119,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load) store <4 x i32> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3149,10 +3129,10 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x float> @llvm.vp.fmul.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) +declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) -define void @sink_splat_vp_fmul(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_fmul: +define void @sink_splat_sadd_sat(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_sadd_sat: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 @@ -3160,25 +3140,23 @@ define void @sink_splat_vp_fmul(ptr nocapture %a, float %x, <4 x i1> %m, i32 zer ; CHECK-NEXT: .LBB54_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vfmul.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a0, a2, .LBB54_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret 
entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = call <4 x float> @llvm.vp.fmul.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) - store <4 x float> %1, ptr %0, align 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3187,10 +3165,8 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x float> @llvm.vp.fdiv.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) - -define void @sink_splat_vp_fdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_fdiv: +define void @sink_splat_sadd_sat_commute(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_sadd_sat_commute: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 @@ -3198,25 +3174,23 @@ define void @sink_splat_vp_fdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 zer ; CHECK-NEXT: .LBB55_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vfdiv.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; 
CHECK-NEXT: vsadd.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a0, a2, .LBB55_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = call <4 x float> @llvm.vp.fdiv.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) - store <4 x float> %1, ptr %0, align 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3225,35 +3199,35 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_vp_frdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_frdiv: +declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) + +define void @sink_splat_ssub_sat(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_ssub_sat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB56_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: 
vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vfrdiv.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB56_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB56_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = call <4 x float> @llvm.vp.fdiv.v4i32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x i1> %m, i32 %vl) - store <4 x float> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) + store <4 x i32> %1, ptr %0, align 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3261,10 +3235,10 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x float> @llvm.vp.fadd.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) +declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>) -define void @sink_splat_vp_fadd(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_fadd: +define void @sink_splat_uadd_sat(ptr nocapture %a, i32 
signext %x) { +; CHECK-LABEL: sink_splat_uadd_sat: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 @@ -3272,25 +3246,23 @@ define void @sink_splat_vp_fadd(ptr nocapture %a, float %x, <4 x i1> %m, i32 zer ; CHECK-NEXT: .LBB57_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vfadd.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a0, a2, .LBB57_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = call <4 x float> @llvm.vp.fadd.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) - store <4 x float> %1, ptr %0, align 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3299,10 +3271,8 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x float> @llvm.vp.fsub.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) - -define void 
@sink_splat_vp_fsub(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_fsub: +define void @sink_splat_uadd_sat_commute(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_uadd_sat_commute: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 @@ -3310,25 +3280,23 @@ define void @sink_splat_vp_fsub(ptr nocapture %a, float %x, <4 x i1> %m, i32 zer ; CHECK-NEXT: .LBB58_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vfsub.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a0, a2, .LBB58_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = call <4 x float> @llvm.vp.fsub.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) - store <4 x float> %1, ptr %0, align 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, 
label %vector.body @@ -3337,37 +3305,35 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x float> @llvm.vp.frsub.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) +declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) -define void @sink_splat_vp_frsub(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_frsub: +define void @sink_splat_usub_sat(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_usub_sat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB59_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vfrsub.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB59_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB59_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = call <4 x float> @llvm.vp.fsub.v4i32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x i1> %m, i32 %vl) - store <4 x float> %1, ptr %0, align 4 - %index.next = add nuw i64 
%index, 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) + store <4 x i32> %1, ptr %0, align 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3375,10 +3341,10 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) +declare <4 x i32> @llvm.vp.mul.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) -define void @sink_splat_vp_udiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_udiv: +define void @sink_splat_vp_mul(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_mul: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a3, 1 ; CHECK-NEXT: add a3, a0, a3 @@ -3387,7 +3353,7 @@ define void @sink_splat_vp_udiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vdivu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vmul.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 @@ -3403,7 +3369,7 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 @@ -3413,10 +3379,10 @@ for.cond.cleanup: ; preds = %vector.body ret void 
} -declare <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) +declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) -define void @sink_splat_vp_sdiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_sdiv: +define void @sink_splat_vp_add(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_add: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a3, 1 ; CHECK-NEXT: add a3, a0, a3 @@ -3425,7 +3391,7 @@ define void @sink_splat_vp_sdiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vdiv.vx v8, v8, a1, v0.t +; CHECK-NEXT: vadd.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 @@ -3441,7 +3407,7 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 @@ -3451,10 +3417,8 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.urem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) - -define void @sink_splat_vp_urem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_urem: +define void @sink_splat_vp_add_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_add_commute: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a3, 1 ; CHECK-NEXT: add a3, a0, a3 
@@ -3463,7 +3427,7 @@ define void @sink_splat_vp_urem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vremu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vadd.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 @@ -3479,7 +3443,7 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 @@ -3489,10 +3453,10 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.srem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) +declare <4 x i32> @llvm.vp.sub.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) -define void @sink_splat_vp_srem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_srem: +define void @sink_splat_vp_sub(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_sub: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a3, 1 ; CHECK-NEXT: add a3, a0, a3 @@ -3501,7 +3465,7 @@ define void @sink_splat_vp_srem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vrem.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsub.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 @@ -3517,7 +3481,7 @@ 
vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 @@ -3527,24 +3491,21 @@ for.cond.cleanup: ; preds = %vector.body ret void } -; Check that we don't sink a splat operand that has no chance of being folded. - -define void @sink_splat_vp_srem_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_srem_commute: +define void @sink_splat_vp_rsub(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_rsub: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: lui a1, 1 -; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: .LBB64_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vrem.vv v9, v8, v9, v0.t +; CHECK-NEXT: vrsub.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a1, .LBB64_1 +; CHECK-NEXT: bne a0, a3, .LBB64_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -3556,7 +3517,7 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 
x i32> @llvm.vp.srem.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 @@ -3566,115 +3527,112 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x float> @llvm.vp.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32) +declare <4 x i32> @llvm.vp.shl.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) -define void @sink_splat_vp_fma(ptr noalias nocapture %a, ptr nocapture readonly %b, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_fma: +define void @sink_splat_vp_shl(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_shl: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a1, a3 +; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB65_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle32.v v9, (a1) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t +; CHECK-NEXT: vsll.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a1, a3, .LBB65_1 +; CHECK-NEXT: bne a0, a3, .LBB65_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; 
preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = getelementptr inbounds float, ptr %b, i64 %index - %wide.load12 = load <4 x float>, ptr %1, align 4 - %2 = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x float> %wide.load12, <4 x i1> %m, i32 %vl) - store <4 x float> %2, ptr %0, align 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.shl.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 - %3 = icmp eq i64 %index.next, 1024 - br i1 %3, label %for.cond.cleanup, label %vector.body + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_vp_fma_commute(ptr noalias nocapture %a, ptr nocapture readonly %b, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_fma_commute: +declare <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_lshr(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_lshr: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a1, a3 +; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB66_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle32.v v9, (a1) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t +; CHECK-NEXT: vsrl.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: addi a0, a0, 16 -; 
CHECK-NEXT: bne a1, a3, .LBB66_1 +; CHECK-NEXT: bne a0, a3, .LBB66_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = getelementptr inbounds float, ptr %b, i64 %index - %wide.load12 = load <4 x float>, ptr %1, align 4 - %2 = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12, <4 x i1> %m, i32 %vl) - store <4 x float> %2, ptr %0, align 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 - %3 = icmp eq i64 %index.next, 1024 - br i1 %3, label %for.cond.cleanup, label %vector.body + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body for.cond.cleanup: ; preds = %vector.body ret void } +declare <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) -define void @sink_splat_mul_lmul2(ptr nocapture %a, i64 signext %x) { -; CHECK-LABEL: sink_splat_mul_lmul2: +define void @sink_splat_vp_ashr(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_ashr: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli 
zero, 4, e64, m2, ta, ma +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB67_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmul.vx v8, v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB67_1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsra.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB67_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 - %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <4 x i64>, ptr %0, align 8 - %1 = mul <4 x i64> %wide.load, %broadcast.splat - store <4 x i64> %1, ptr %0, align 8 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3683,32 +3641,36 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_add_lmul2(ptr nocapture %a, i64 signext %x) { -; CHECK-LABEL: sink_splat_add_lmul2: +declare <4 x float> 
@llvm.vp.fmul.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_fmul(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fmul: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB68_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vadd.vx v8, v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vfmul.vf v8, v8, fa0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a0, a2, .LBB68_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 - %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <4 x i64>, ptr %0, align 8 - %1 = add <4 x i64> %wide.load, %broadcast.splat - store <4 x i64> %1, ptr %0, align 8 + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = call <4 x float> @llvm.vp.fmul.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x float> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, 
label %for.cond.cleanup, label %vector.body @@ -3717,32 +3679,36 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_sub_lmul2(ptr nocapture %a, i64 signext %x) { -; CHECK-LABEL: sink_splat_sub_lmul2: +declare <4 x float> @llvm.vp.fdiv.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_fdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fdiv: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB69_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsub.vx v8, v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vfdiv.vf v8, v8, fa0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a0, a2, .LBB69_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 - %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <4 x i64>, ptr %0, align 8 - %1 = sub <4 x i64> %wide.load, %broadcast.splat - store <4 x i64> %1, ptr %0, align 8 + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, 
align 4 + %1 = call <4 x float> @llvm.vp.fdiv.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x float> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3751,32 +3717,1007 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_rsub_lmul2(ptr nocapture %a, i64 signext %x) { -; CHECK-LABEL: sink_splat_rsub_lmul2: +define void @sink_splat_vp_frdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_frdiv: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB70_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vrsub.vx v8, v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vfrdiv.vf v8, v8, fa0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB70_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = call <4 x float> @llvm.vp.fdiv.v4i32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x float> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 
%index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x float> @llvm.vp.fadd.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_fadd(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fadd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB71_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vfadd.vf v8, v8, fa0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB71_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = call <4 x float> @llvm.vp.fadd.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x float> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x float> @llvm.vp.fsub.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_fsub(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; 
CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB72_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vfsub.vf v8, v8, fa0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB72_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = call <4 x float> @llvm.vp.fsub.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x float> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x float> @llvm.vp.frsub.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_frsub(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_frsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB73_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vfrsub.vf v8, v8, fa0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB73_1 +; CHECK-NEXT: # %bb.2: 
# %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = call <4 x float> @llvm.vp.fsub.v4i32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x float> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_udiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_udiv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB74_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vdivu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB74_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = 
load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_sdiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_sdiv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB75_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vdiv.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB75_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.urem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + 
+define void @sink_splat_vp_urem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_urem: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB76_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vremu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB76_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.srem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_srem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_srem: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB77_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: 
vrem.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB77_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +; Check that we don't sink a splat operand that has no chance of being folded. 
+ +define void @sink_splat_vp_srem_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_srem_commute: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: lui a1, 1 +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: .LBB78_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vrem.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a1, .LBB78_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x float> @llvm.vp.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_fma(ptr noalias nocapture %a, ptr nocapture readonly %b, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fma: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a1, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB79_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; 
CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a1, a3, .LBB79_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = getelementptr inbounds float, ptr %b, i64 %index + %wide.load12 = load <4 x float>, ptr %1, align 4 + %2 = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x float> %wide.load12, <4 x i1> %m, i32 %vl) + store <4 x float> %2, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_vp_fma_commute(ptr noalias nocapture %a, ptr nocapture readonly %b, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fma_commute: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a1, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB80_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a1, a1, 16 +; 
CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a1, a3, .LBB80_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = getelementptr inbounds float, ptr %b, i64 %index + %wide.load12 = load <4 x float>, ptr %1, align 4 + %2 = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12, <4 x i1> %m, i32 %vl) + store <4 x float> %2, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + + +define void @sink_splat_mul_lmul2(ptr nocapture %a, i64 signext %x) { +; CHECK-LABEL: sink_splat_mul_lmul2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: .LBB81_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vmul.vx v8, v8, a1 +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB81_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 + %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + 
%wide.load = load <4 x i64>, ptr %0, align 8 + %1 = mul <4 x i64> %wide.load, %broadcast.splat + store <4 x i64> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_add_lmul2(ptr nocapture %a, i64 signext %x) { +; CHECK-LABEL: sink_splat_add_lmul2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: .LBB82_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB82_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 + %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <4 x i64>, ptr %0, align 8 + %1 = add <4 x i64> %wide.load, %broadcast.splat + store <4 x i64> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_sub_lmul2(ptr nocapture %a, i64 signext %x) { +; CHECK-LABEL: sink_splat_sub_lmul2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: .LBB83_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsub.vx v8, v8, a1 +; 
CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB83_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 + %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <4 x i64>, ptr %0, align 8 + %1 = sub <4 x i64> %wide.load, %broadcast.splat + store <4 x i64> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_rsub_lmul2(ptr nocapture %a, i64 signext %x) { +; CHECK-LABEL: sink_splat_rsub_lmul2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: .LBB84_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB84_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 + %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <4 x i64>, ptr %0, align 8 + %1 = sub <4 x i64> %broadcast.splat, %wide.load + store <4 x i64> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp 
eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_and_lmul2(ptr nocapture %a, i64 signext %x) { +; CHECK-LABEL: sink_splat_and_lmul2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: .LBB85_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB85_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 + %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <4 x i64>, ptr %0, align 8 + %1 = and <4 x i64> %wide.load, %broadcast.splat + store <4 x i64> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_or_lmul2(ptr nocapture %a, i64 signext %x) { +; CHECK-LABEL: sink_splat_or_lmul2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: .LBB86_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vor.vx v8, v8, a1 +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB86_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert 
= insertelement <4 x i64> poison, i64 %x, i64 0 + %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <4 x i64>, ptr %0, align 8 + %1 = or <4 x i64> %wide.load, %broadcast.splat + store <4 x i64> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_xor_lmul2(ptr nocapture %a, i64 signext %x) { +; CHECK-LABEL: sink_splat_xor_lmul2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: .LBB87_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB87_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 + %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <4 x i64>, ptr %0, align 8 + %1 = xor <4 x i64> %wide.load, %broadcast.splat + store <4 x i64> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_mul_lmul8(ptr nocapture %a, 
i32 signext %x) { +; CHECK-LABEL: sink_splat_mul_lmul8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: .LBB88_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmul.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB88_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <32 x i32>, ptr %0, align 4 + %1 = mul <32 x i32> %wide.load, %broadcast.splat + store <32 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_add_lmul8(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_add_lmul8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: .LBB89_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB89_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> 
poison, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <32 x i32>, ptr %0, align 4 + %1 = add <32 x i32> %wide.load, %broadcast.splat + store <32 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_sub_lmul8(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_sub_lmul8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: .LBB90_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsub.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB90_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <32 x i32>, ptr %0, align 4 + %1 = sub <32 x i32> %wide.load, %broadcast.splat + store <32 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_rsub_lmul8(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_rsub_lmul8: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: .LBB91_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB91_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <32 x i32>, ptr %0, align 4 + %1 = sub <32 x i32> %broadcast.splat, %wide.load + store <32 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_and_lmul8(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_and_lmul8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: .LBB92_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB92_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = 
%vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <32 x i32>, ptr %0, align 4 + %1 = and <32 x i32> %wide.load, %broadcast.splat + store <32 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_or_lmul8(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_or_lmul8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: .LBB93_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vor.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB93_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <32 x i32>, ptr %0, align 4 + %1 = or <32 x i32> %wide.load, %broadcast.splat + store <32 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_xor_lmul8(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_xor_lmul8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: 
vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: .LBB94_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB94_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <32 x i32>, ptr %0, align 4 + %1 = xor <32 x i32> %wide.load, %broadcast.splat + store <32 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_mul_lmulmf2(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_mul_lmulmf2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: .LBB95_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmul.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB95_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, 
ptr %a, i64 %index + %wide.load = load <2 x i32>, ptr %0, align 8 + %1 = mul <2 x i32> %wide.load, %broadcast.splat + store <2 x i32> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_add_lmulmf2(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_add_lmulmf2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: .LBB96_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB96_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <2 x i32>, ptr %0, align 8 + %1 = add <2 x i32> %wide.load, %broadcast.splat + store <2 x i32> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_sub_lmulmf2(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_sub_lmulmf2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: .LBB97_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; 
CHECK-NEXT: vsub.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB70_1 +; CHECK-NEXT: bne a0, a2, .LBB97_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 - %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <4 x i64>, ptr %0, align 8 - %1 = sub <4 x i64> %broadcast.splat, %wide.load - store <4 x i64> %1, ptr %0, align 8 + %wide.load = load <2 x i32>, ptr %0, align 8 + %1 = sub <2 x i32> %wide.load, %broadcast.splat + store <2 x i32> %1, ptr %0, align 8 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3785,32 +4726,32 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_and_lmul2(ptr nocapture %a, i64 signext %x) { -; CHECK-LABEL: sink_splat_and_lmul2: +define void @sink_splat_rsub_lmulmf2(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_rsub_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a2, 2 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: .LBB71_1: # %vector.body +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: .LBB98_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: 
addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB71_1 +; CHECK-NEXT: bne a0, a2, .LBB98_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 - %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <4 x i64>, ptr %0, align 8 - %1 = and <4 x i64> %wide.load, %broadcast.splat - store <4 x i64> %1, ptr %0, align 8 + %wide.load = load <2 x i32>, ptr %0, align 8 + %1 = sub <2 x i32> %broadcast.splat, %wide.load + store <2 x i32> %1, ptr %0, align 8 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3819,32 +4760,32 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_or_lmul2(ptr nocapture %a, i64 signext %x) { -; CHECK-LABEL: sink_splat_or_lmul2: +define void @sink_splat_and_lmulmf2(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_and_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a2, 2 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: .LBB72_1: # %vector.body +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: .LBB99_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vor.vx v8, v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB72_1 +; CHECK-NEXT: bne a0, a2, 
.LBB99_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 - %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <4 x i64>, ptr %0, align 8 - %1 = or <4 x i64> %wide.load, %broadcast.splat - store <4 x i64> %1, ptr %0, align 8 + %wide.load = load <2 x i32>, ptr %0, align 8 + %1 = and <2 x i32> %wide.load, %broadcast.splat + store <2 x i32> %1, ptr %0, align 8 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3853,32 +4794,32 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_xor_lmul2(ptr nocapture %a, i64 signext %x) { -; CHECK-LABEL: sink_splat_xor_lmul2: +define void @sink_splat_or_lmulmf2(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_or_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a2, 2 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: .LBB73_1: # %vector.body +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: .LBB100_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vxor.vx v8, v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vor.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB73_1 +; CHECK-NEXT: bne a0, a2, .LBB100_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - 
%broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 - %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <4 x i64>, ptr %0, align 8 - %1 = xor <4 x i64> %wide.load, %broadcast.splat - store <4 x i64> %1, ptr %0, align 8 + %wide.load = load <2 x i32>, ptr %0, align 8 + %1 = or <2 x i32> %wide.load, %broadcast.splat + store <2 x i32> %1, ptr %0, align 8 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3887,33 +4828,32 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_mul_lmul8(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_mul_lmul8: +define void @sink_splat_xor_lmulmf2(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_xor_lmulmf2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: lui a2, 2 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: .LBB74_1: # %vector.body +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: .LBB101_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmul.vx v8, v8, a1 +; CHECK-NEXT: vxor.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB74_1 +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB101_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <32 x i32> poison, i32 
%x, i32 0 - %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.load = load <32 x i32>, ptr %0, align 4 - %1 = mul <32 x i32> %wide.load, %broadcast.splat - store <32 x i32> %1, ptr %0, align 4 + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <2 x i32>, ptr %0, align 8 + %1 = xor <2 x i32> %wide.load, %broadcast.splat + store <2 x i32> %1, ptr %0, align 8 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3922,33 +4862,39 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_add_lmul8(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_add_lmul8: +declare <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32>, <4 x i32>, metadata, <4 x i1>, i32) + +define void @sink_splat_vp_icmp(ptr nocapture %x, i32 signext %y, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_icmp: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: .LBB75_1: # %vector.body +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: .LBB102_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vadd.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: 
vmv1r.v v0, v8 +; CHECK-NEXT: vmseq.vx v0, v10, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v9, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB75_1 +; CHECK-NEXT: bne a0, a3, .LBB102_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 - %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %y, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.load = load <32 x i32>, ptr %0, align 4 - %1 = add <32 x i32> %wide.load, %broadcast.splat - store <32 x i32> %1, ptr %0, align 4 + %0 = getelementptr inbounds i32, ptr %x, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, metadata !"eq", <4 x i1> %m, i32 %vl) + call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr %0, i32 4, <4 x i1> %1) %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3957,33 +4903,39 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_sub_lmul8(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_sub_lmul8: +declare <4 x i1> @llvm.vp.fcmp.v4f32(<4 x float>, <4 x float>, metadata, <4 x i1>, i32) + +define void @sink_splat_vp_fcmp(ptr nocapture %x, float %y, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fcmp: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmv1r.v v8, v0 ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 
32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: .LBB76_1: # %vector.body +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: .LBB103_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsub.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmfeq.vf v0, v10, fa0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v9, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB76_1 +; CHECK-NEXT: bne a0, a2, .LBB103_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 - %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x float> poison, float %y, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.load = load <32 x i32>, ptr %0, align 4 - %1 = sub <32 x i32> %wide.load, %broadcast.splat - store <32 x i32> %1, ptr %0, align 4 + %0 = getelementptr inbounds float, ptr %x, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = call <4 x i1> @llvm.vp.fcmp.v4f32(<4 x float> %wide.load, <4 x float> %broadcast.splat, metadata !"oeq", <4 x i1> %m, i32 %vl) + call void @llvm.masked.store.v4f32.p0(<4 x float> zeroinitializer, ptr %0, i32 4, <4 x i1> %1) %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3992,33 +4944,36 @@ for.cond.cleanup: ; preds = 
%vector.body ret void } -define void @sink_splat_rsub_lmul8(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_rsub_lmul8: +declare <4 x i32> @llvm.vp.smin.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_min(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_min: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: .LBB77_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB104_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vmin.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB77_1 +; CHECK-NEXT: bne a0, a3, .LBB104_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 - %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.load = load <32 x i32>, ptr %0, align 4 - %1 = sub <32 x i32> %broadcast.splat, %wide.load - store <32 x i32> %1, ptr %0, align 4 + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.smin.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + 
store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4027,33 +4982,34 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_and_lmul8(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_and_lmul8: +define void @sink_splat_vp_min_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_min_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: .LBB78_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB105_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vmin.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB78_1 +; CHECK-NEXT: bne a0, a3, .LBB105_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 - %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.load = load <32 x i32>, ptr %0, align 4 - %1 = and <32 x i32> %wide.load, %broadcast.splat - store <32 x i32> %1, ptr %0, align 4 + %wide.load = 
load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.smin.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4062,33 +5018,36 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_or_lmul8(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_or_lmul8: +declare <4 x i32> @llvm.vp.smax.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_max(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_max: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: .LBB79_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB106_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vor.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vmax.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB79_1 +; CHECK-NEXT: bne a0, a3, .LBB106_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 - %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body 
] %0 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.load = load <32 x i32>, ptr %0, align 4 - %1 = or <32 x i32> %wide.load, %broadcast.splat - store <32 x i32> %1, ptr %0, align 4 + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.smax.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4097,33 +5056,34 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_xor_lmul8(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_xor_lmul8: +define void @sink_splat_vp_max_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_max_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: .LBB80_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB107_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vmax.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB80_1 +; CHECK-NEXT: bne a0, a3, .LBB107_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 - %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x 
i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.load = load <32 x i32>, ptr %0, align 4 - %1 = xor <32 x i32> %wide.load, %broadcast.splat - store <32 x i32> %1, ptr %0, align 4 + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.smax.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4132,32 +5092,34 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_mul_lmulmf2(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_mul_lmulmf2: +define void @sink_splat_vp_umin_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_umin_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: .LBB81_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB108_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmul.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vminu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB81_1 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB108_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 - %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + 
%broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <2 x i32>, ptr %0, align 8 - %1 = mul <2 x i32> %wide.load, %broadcast.splat - store <2 x i32> %1, ptr %0, align 8 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.umin.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4166,32 +5128,36 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_add_lmulmf2(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_add_lmulmf2: +declare <4 x i32> @llvm.vp.umax.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_umax(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_umax: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: .LBB82_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB109_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vmaxu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB82_1 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne 
a0, a3, .LBB109_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 - %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <2 x i32>, ptr %0, align 8 - %1 = add <2 x i32> %wide.load, %broadcast.splat - store <2 x i32> %1, ptr %0, align 8 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.umax.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4200,32 +5166,34 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_sub_lmulmf2(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_sub_lmulmf2: +define void @sink_splat_vp_umax_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_umax_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: .LBB83_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB110_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vmaxu.vx 
v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB83_1 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB110_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 - %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <2 x i32>, ptr %0, align 8 - %1 = sub <2 x i32> %wide.load, %broadcast.splat - store <2 x i32> %1, ptr %0, align 8 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.umax.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4234,32 +5202,36 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_rsub_lmulmf2(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_rsub_lmulmf2: +declare <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_sadd_sat(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_sadd_sat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: .LBB84_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add 
a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB111_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB84_1 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB111_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 - %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <2 x i32>, ptr %0, align 8 - %1 = sub <2 x i32> %broadcast.splat, %wide.load - store <2 x i32> %1, ptr %0, align 8 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4268,32 +5240,34 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_and_lmulmf2(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_and_lmulmf2: +define void @sink_splat_vp_sadd_sat_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: 
sink_splat_vp_sadd_sat_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: .LBB85_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB112_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB85_1 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB112_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 - %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <2 x i32>, ptr %0, align 8 - %1 = and <2 x i32> %wide.load, %broadcast.splat - store <2 x i32> %1, ptr %0, align 8 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4302,33 +5276,37 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void 
@sink_splat_or_lmulmf2(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_or_lmulmf2: +declare <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_ssub_sat(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_ssub_sat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: .LBB86_1: # %vector.body +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB113_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vor.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB86_1 +; CHECK-NEXT: addi a3, a3, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a3, .LBB113_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 - %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <2 x i32>, ptr %0, align 8 - %1 = or <2 x i32> %wide.load, %broadcast.splat - store <2 x i32> %1, ptr %0, align 8 - %index.next = add nuw i64 %index, 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> 
@llvm.vp.ssub.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4336,32 +5314,36 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_xor_lmulmf2(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_xor_lmulmf2: +declare <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_uadd_sat(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_uadd_sat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: .LBB87_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB114_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB87_1 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB114_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 - %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = 
getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <2 x i32>, ptr %0, align 8 - %1 = xor <2 x i32> %wide.load, %broadcast.splat - store <2 x i32> %1, ptr %0, align 8 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4370,39 +5352,34 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32>, <4 x i32>, metadata, <4 x i1>, i32) - -define void @sink_splat_vp_icmp(ptr nocapture %x, i32 signext %y, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_icmp: +define void @sink_splat_vp_uadd_sat_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_uadd_sat_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmv1r.v v8, v0 ; CHECK-NEXT: lui a3, 1 ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: .LBB88_1: # %vector.body +; CHECK-NEXT: .LBB115_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmseq.vx v0, v10, a1, v0.t +; CHECK-NEXT: vsaddu.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v9, (a0), v0.t +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB88_1 +; CHECK-NEXT: bne a0, a3, .LBB115_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %y, i32 0 + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 
0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i32, ptr %x, i64 %index + %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, metadata !"eq", <4 x i1> %m, i32 %vl) - call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr %0, i32 4, <4 x i1> %1) + %1 = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4411,40 +5388,37 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i1> @llvm.vp.fcmp.v4f32(<4 x float>, <4 x float>, metadata, <4 x i1>, i32) +declare <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) -define void @sink_splat_vp_fcmp(ptr nocapture %x, float %y, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_fcmp: +define void @sink_splat_vp_usub_sat(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_usub_sat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmv1r.v v8, v0 -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: .LBB89_1: # %vector.body +; CHECK-NEXT: .LBB116_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmfeq.vf v0, v10, fa0, v0.t +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma 
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v9, (a0), v0.t -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB89_1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a3, .LBB116_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %y, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %x, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = call <4 x i1> @llvm.vp.fcmp.v4f32(<4 x float> %wide.load, <4 x float> %broadcast.splat, metadata !"oeq", <4 x i1> %m, i32 %vl) - call void @llvm.masked.store.v4f32.p0(<4 x float> zeroinitializer, ptr %0, i32 4, <4 x i1> %1) - %index.next = add nuw i64 %index, 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index abfe3e6428e663..22aae4de4db9d2 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -2171,19 +2171,14 @@ define void @test_concat_v2i1(ptr %arg, ptr %arg1, ptr 
%arg2) nounwind { ; KNL-LABEL: test_concat_v2i1: ; KNL: ## %bb.0: ; KNL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; KNL-NEXT: vpextrw $0, %xmm0, %eax -; KNL-NEXT: movzwl %ax, %eax -; KNL-NEXT: vmovd %eax, %xmm1 +; KNL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; KNL-NEXT: vcvtph2ps %xmm1, %xmm1 ; KNL-NEXT: vmovss {{.*#+}} xmm2 = [6.0E+0,0.0E+0,0.0E+0,0.0E+0] ; KNL-NEXT: vucomiss %xmm2, %xmm1 ; KNL-NEXT: setb %al ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k0 -; KNL-NEXT: vpsrld $16, %xmm0, %xmm0 -; KNL-NEXT: vpextrw $0, %xmm0, %eax -; KNL-NEXT: movzwl %ax, %eax -; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 ; KNL-NEXT: vucomiss %xmm2, %xmm0 ; KNL-NEXT: setb %al @@ -2212,19 +2207,14 @@ define void @test_concat_v2i1(ptr %arg, ptr %arg1, ptr %arg2) nounwind { ; SKX-LABEL: test_concat_v2i1: ; SKX: ## %bb.0: ; SKX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; SKX-NEXT: vpsrld $16, %xmm0, %xmm1 -; SKX-NEXT: vpextrw $0, %xmm1, %eax -; SKX-NEXT: movzwl %ax, %eax -; SKX-NEXT: vmovd %eax, %xmm1 +; SKX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; SKX-NEXT: vcvtph2ps %xmm1, %xmm1 ; SKX-NEXT: vmovss {{.*#+}} xmm2 = [6.0E+0,0.0E+0,0.0E+0,0.0E+0] ; SKX-NEXT: vucomiss %xmm2, %xmm1 ; SKX-NEXT: setb %al ; SKX-NEXT: kmovd %eax, %k0 ; SKX-NEXT: kshiftlb $1, %k0, %k0 -; SKX-NEXT: vpextrw $0, %xmm0, %eax -; SKX-NEXT: movzwl %ax, %eax -; SKX-NEXT: vmovd %eax, %xmm0 +; SKX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 ; SKX-NEXT: vucomiss %xmm2, %xmm0 ; SKX-NEXT: setb %al diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll index e2b46846731d04..b2c54160a0eae2 100644 --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -1436,10 +1436,9 @@ 
define void @half_vec_compare(<2 x half>* %x, <2 x i8>* %y) { ; KNL: ## %bb.0: ## %entry ; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; KNL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] -; KNL-NEXT: vpsrld $16, %xmm0, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xd0,0x10] -; KNL-NEXT: vpextrw $0, %xmm1, %eax ## encoding: [0xc5,0xf9,0xc5,0xc1,0x00] -; KNL-NEXT: movzwl %ax, %eax ## encoding: [0x0f,0xb7,0xc0] -; KNL-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8] +; KNL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; KNL-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x0d,A,A,A,A] +; KNL-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; KNL-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9] ; KNL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] @@ -1449,9 +1448,8 @@ define void @half_vec_compare(<2 x half>* %x, <2 x i8>* %y) { ; KNL-NEXT: movl $0, %edx ## encoding: [0xba,0x00,0x00,0x00,0x00] ; KNL-NEXT: cmovnel %ecx, %edx ## encoding: [0x0f,0x45,0xd1] ; KNL-NEXT: cmovpl %ecx, %edx ## encoding: [0x0f,0x4a,0xd1] -; KNL-NEXT: vpextrw $0, %xmm0, %edi ## encoding: [0xc5,0xf9,0xc5,0xf8,0x00] -; KNL-NEXT: movzwl %di, %edi ## encoding: [0x0f,0xb7,0xff] -; KNL-NEXT: vmovd %edi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7] +; KNL-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0] +; KNL-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; KNL-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2] ; KNL-NEXT: cmovnel %ecx, %eax ## encoding: [0x0f,0x45,0xc1] @@ -1468,10 +1466,9 @@ define void @half_vec_compare(<2 x half>* %x, <2 x i8>* %y) { ; AVX512BW: ## %bb.0: ## %entry ; AVX512BW-NEXT: vmovd 
{{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512BW-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] -; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xd0,0x10] -; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc5,0xc1,0x00] -; AVX512BW-NEXT: movzwl %ax, %eax ## encoding: [0x0f,0xb7,0xc0] -; AVX512BW-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x0d,A,A,A,A] +; AVX512BW-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9] ; AVX512BW-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; AVX512BW-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] @@ -1481,9 +1478,8 @@ define void @half_vec_compare(<2 x half>* %x, <2 x i8>* %y) { ; AVX512BW-NEXT: movl $0, %edx ## encoding: [0xba,0x00,0x00,0x00,0x00] ; AVX512BW-NEXT: cmovnel %ecx, %edx ## encoding: [0x0f,0x45,0xd1] ; AVX512BW-NEXT: cmovpl %ecx, %edx ## encoding: [0x0f,0x4a,0xd1] -; AVX512BW-NEXT: vpextrw $0, %xmm0, %edi ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc5,0xf8,0x00] -; AVX512BW-NEXT: movzwl %di, %edi ## encoding: [0x0f,0xb7,0xff] -; AVX512BW-NEXT: vmovd %edi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7] +; AVX512BW-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0] +; AVX512BW-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; AVX512BW-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2] ; AVX512BW-NEXT: cmovnel %ecx, %eax ## encoding: [0x0f,0x45,0xc1] @@ -1500,10 +1496,9 @@ define void @half_vec_compare(<2 x half>* %x, <2 x i8>* %y) { ; SKX: ## 
%bb.0: ## %entry ; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SKX-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] -; SKX-NEXT: vpsrld $16, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xd0,0x10] -; SKX-NEXT: vpextrw $0, %xmm1, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc5,0xc1,0x00] -; SKX-NEXT: movzwl %ax, %eax ## encoding: [0x0f,0xb7,0xc0] -; SKX-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8] +; SKX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; SKX-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0x0d,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; SKX-NEXT: vcvtph2ps %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc9] ; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2] ; SKX-NEXT: vucomiss %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca] @@ -1512,9 +1507,8 @@ define void @half_vec_compare(<2 x half>* %x, <2 x i8>* %y) { ; SKX-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] ; SKX-NEXT: testb %cl, %cl ## encoding: [0x84,0xc9] ; SKX-NEXT: setne %al ## encoding: [0x0f,0x95,0xc0] -; SKX-NEXT: vpextrw $0, %xmm0, %ecx ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc5,0xc8,0x00] -; SKX-NEXT: movzwl %cx, %ecx ## encoding: [0x0f,0xb7,0xc9] -; SKX-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] +; SKX-NEXT: vpmovzxwq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x34,0xc0] +; SKX-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; SKX-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2] ; SKX-NEXT: setp %cl ## encoding: [0x0f,0x9a,0xc1] diff --git 
a/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll b/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll index e2ea8974f6551f..f6fb2fcc957ef2 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll @@ -92,7 +92,7 @@ define half @f6(half %x, i16 %y) { define half @f7(half %x) { ; CHECK-LABEL: f7: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vmovsh {{.*#+}} xmm1 = [1.7881E-7,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast half %x to i16 @@ -106,7 +106,7 @@ define half @f7(half %x) { define half @f8(half %x) { ; CHECK-LABEL: f8: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vmovsh {{.*#+}} xmm1 = [2.3842E-7,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast half %x to i16 @@ -171,7 +171,7 @@ define half @xor(half %x, half %y) { define half @f7_or(half %x) { ; CHECK-LABEL: f7_or: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vmovsh {{.*#+}} xmm1 = [1.7881E-7,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast half %x to i16 @@ -183,7 +183,7 @@ define half @f7_or(half %x) { define half @f7_xor(half %x) { ; CHECK-LABEL: f7_xor: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vmovsh {{.*#+}} xmm1 = [1.7881E-7,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast half %x to i16 @@ -199,7 +199,7 @@ define half @f7_xor(half %x) { define half @movmsk(half %x) { ; CHECK-LABEL: movmsk: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vmovsh {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; 
CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast half %x to i16 @@ -271,7 +271,7 @@ define half @fadd_bitcast_fneg(half %x, half %y) { define half @fsub_bitcast_fneg(half %x, half %y) { ; CHECK-LABEL: fsub_bitcast_fneg: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: vmovsh {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: vxorps %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll index c8708ea9b681fe..a3fb71f817ce47 100644 --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll @@ -699,34 +699,23 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) nounwind { ; AVX2-LABEL: stest_f16i32: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1 -; AVX2-NEXT: vpextrw $0, %xmm1, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX2-NEXT: vpextrw $0, %xmm2, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 ; AVX2-NEXT: vcvttss2si %xmm1, %rax -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-NEXT: vcvttss2si %xmm1, %rcx +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX2-NEXT: vmovq %rax, %xmm2 ; AVX2-NEXT: vcvttss2si %xmm1, %rax -; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vmovq %rcx, %xmm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpextrw $0, %xmm0, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $0, %xmm0, 
%eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvttss2si %xmm2, %rax +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX2-NEXT: vmovq %rax, %xmm2 ; AVX2-NEXT: vcvttss2si %xmm0, %rax ; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 @@ -849,9 +838,6 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind { ; AVX2-LABEL: utesth_f16i32: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1 -; AVX2-NEXT: vpextrw $0, %xmm1, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm2 ; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 @@ -860,37 +846,29 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind { ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx ; AVX2-NEXT: andq %rax, %rdx -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX2-NEXT: vpextrw $0, %xmm2, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm3, %rax +; AVX2-NEXT: vmovq %rdx, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm2, %rcx -; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vpextrw $0, %xmm0, %edx -; AVX2-NEXT: movzwl %dx, %edx -; AVX2-NEXT: vmovd %edx, %xmm3 ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX2-NEXT: andq %rax, %rdx -; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4 +; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm4 ; AVX2-NEXT: vcvttss2si %xmm4, %rax ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm4 -; AVX2-NEXT: vcvttss2si %xmm3, %rcx -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX2-NEXT: vcvttss2si %xmm2, %rcx +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm3[0] ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $0, %xmm0, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vcvttss2si %xmm1, %rax @@ -901,7 +879,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind { ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] @@ -1024,34 +1002,23 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) nounwind { ; AVX2-LABEL: ustest_f16i32: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1 -; AVX2-NEXT: vpextrw $0, %xmm1, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX2-NEXT: vcvttss2si %xmm1, %rax ; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX2-NEXT: vpextrw $0, %xmm2, %eax -; AVX2-NEXT: movzwl %ax, %eax -; 
AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX2-NEXT: vcvttss2si %xmm2, %rax ; AVX2-NEXT: vmovq %rax, %xmm2 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vpextrw $0, %xmm0, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX2-NEXT: vcvttss2si %xmm2, %rax ; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $0, %xmm0, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX2-NEXT: vcvttss2si %xmm0, %rax ; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 @@ -3347,34 +3314,23 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) nounwind { ; AVX2-LABEL: stest_f16i32_mm: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1 -; AVX2-NEXT: vpextrw $0, %xmm1, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX2-NEXT: vpextrw $0, %xmm2, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 ; AVX2-NEXT: vcvttss2si %xmm1, %rax -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-NEXT: vcvttss2si %xmm1, %rcx +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX2-NEXT: vmovq %rax, %xmm2 ; AVX2-NEXT: vcvttss2si %xmm1, %rax -; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vmovq %rcx, %xmm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpextrw $0, %xmm0, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $0, %xmm0, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvttss2si %xmm2, %rax +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX2-NEXT: vmovq %rax, %xmm2 ; AVX2-NEXT: vcvttss2si %xmm0, %rax ; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 @@ -3495,9 +3451,6 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind { ; AVX2-LABEL: utesth_f16i32_mm: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1 -; AVX2-NEXT: vpextrw $0, %xmm1, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm2 ; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 @@ -3506,37 +3459,29 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind { ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx ; AVX2-NEXT: andq %rax, %rdx -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX2-NEXT: vpextrw $0, %xmm2, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm3, %rax +; AVX2-NEXT: vmovq %rdx, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm2, %rcx -; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vpextrw $0, %xmm0, %edx -; AVX2-NEXT: movzwl %dx, %edx -; AVX2-NEXT: vmovd %edx, %xmm3 ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX2-NEXT: andq %rax, %rdx -; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4 +; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm4 ; AVX2-NEXT: vcvttss2si %xmm4, %rax ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm4 -; AVX2-NEXT: vcvttss2si %xmm3, %rcx -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX2-NEXT: vcvttss2si %xmm2, %rcx +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm3[0] ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $0, %xmm0, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vcvttss2si %xmm1, %rax @@ -3547,7 +3492,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind { ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1 @@ -3669,34 
+3614,23 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) nounwind { ; AVX2-LABEL: ustest_f16i32_mm: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm1 -; AVX2-NEXT: vpextrw $0, %xmm1, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX2-NEXT: vcvttss2si %xmm1, %rax ; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX2-NEXT: vpextrw $0, %xmm2, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX2-NEXT: vcvttss2si %xmm2, %rax ; AVX2-NEXT: vmovq %rax, %xmm2 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vpextrw $0, %xmm0, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX2-NEXT: vcvttss2si %xmm2, %rax ; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $0, %xmm0, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX2-NEXT: vcvttss2si %xmm0, %rax ; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index d0853fdc748d29..2e1322446032ff 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -1614,15 +1614,10 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 { ; 
; BWON-F16C-LABEL: maxnum_v8f16: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; BWON-F16C-NEXT: vpextrw $0, %xmm2, %eax -; BWON-F16C-NEXT: movzwl %ax, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm2 +; BWON-F16C-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; BWON-F16C-NEXT: vpshufb %xmm3, %xmm1, %xmm2 ; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 -; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; BWON-F16C-NEXT: vpextrw $0, %xmm3, %eax -; BWON-F16C-NEXT: movzwl %ax, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm3 +; BWON-F16C-NEXT: vpshufb %xmm3, %xmm0, %xmm3 ; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3 ; BWON-F16C-NEXT: vucomiss %xmm2, %xmm3 ; BWON-F16C-NEXT: ja .LBB26_2 @@ -1630,15 +1625,10 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 { ; BWON-F16C-NEXT: vmovaps %xmm2, %xmm3 ; BWON-F16C-NEXT: .LBB26_2: ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm3, %xmm2 -; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; BWON-F16C-NEXT: vpextrw $0, %xmm3, %eax -; BWON-F16C-NEXT: movzwl %ax, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm3 +; BWON-F16C-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; BWON-F16C-NEXT: vpshufb %xmm4, %xmm1, %xmm3 ; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3 -; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; BWON-F16C-NEXT: vpextrw $0, %xmm4, %eax -; BWON-F16C-NEXT: movzwl %ax, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm4 +; BWON-F16C-NEXT: vpshufb %xmm4, %xmm0, %xmm4 ; BWON-F16C-NEXT: vcvtph2ps %xmm4, %xmm4 ; BWON-F16C-NEXT: vucomiss %xmm3, %xmm4 ; BWON-F16C-NEXT: ja .LBB26_4 @@ -1648,49 +1638,33 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 { ; BWON-F16C-NEXT: vmovd %xmm2, %eax ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm4, %xmm2 ; BWON-F16C-NEXT: vmovd %xmm2, %ecx 
-; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; BWON-F16C-NEXT: vpextrw $0, %xmm2, %edx -; BWON-F16C-NEXT: movzwl %dx, %edx -; BWON-F16C-NEXT: vmovd %edx, %xmm2 -; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 -; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; BWON-F16C-NEXT: vpextrw $0, %xmm3, %edx -; BWON-F16C-NEXT: movzwl %dx, %edx -; BWON-F16C-NEXT: vmovd %edx, %xmm3 +; BWON-F16C-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; BWON-F16C-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3 -; BWON-F16C-NEXT: vucomiss %xmm2, %xmm3 +; BWON-F16C-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 +; BWON-F16C-NEXT: vucomiss %xmm3, %xmm2 ; BWON-F16C-NEXT: ja .LBB26_6 ; BWON-F16C-NEXT: # %bb.5: -; BWON-F16C-NEXT: vmovaps %xmm2, %xmm3 +; BWON-F16C-NEXT: vmovaps %xmm3, %xmm2 ; BWON-F16C-NEXT: .LBB26_6: -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm3, %xmm2 +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; BWON-F16C-NEXT: vmovd %xmm2, %edx -; BWON-F16C-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; BWON-F16C-NEXT: vpextrw $0, %xmm2, %esi -; BWON-F16C-NEXT: movzwl %si, %esi -; BWON-F16C-NEXT: vmovd %esi, %xmm2 +; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm3 +; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 -; BWON-F16C-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] -; BWON-F16C-NEXT: vpextrw $0, %xmm3, %esi -; BWON-F16C-NEXT: movzwl %si, %esi -; BWON-F16C-NEXT: vmovd %esi, %xmm3 -; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3 -; BWON-F16C-NEXT: vucomiss %xmm2, %xmm3 +; BWON-F16C-NEXT: vucomiss %xmm3, %xmm2 ; BWON-F16C-NEXT: ja .LBB26_8 ; 
BWON-F16C-NEXT: # %bb.7: -; BWON-F16C-NEXT: vmovaps %xmm2, %xmm3 +; BWON-F16C-NEXT: vmovaps %xmm3, %xmm2 ; BWON-F16C-NEXT: .LBB26_8: -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm3, %xmm2 +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; BWON-F16C-NEXT: vmovd %xmm2, %esi -; BWON-F16C-NEXT: vpsrlq $48, %xmm1, %xmm2 -; BWON-F16C-NEXT: vpextrw $0, %xmm2, %edi -; BWON-F16C-NEXT: movzwl %di, %edi -; BWON-F16C-NEXT: vmovd %edi, %xmm2 +; BWON-F16C-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; BWON-F16C-NEXT: vpshufb %xmm3, %xmm1, %xmm2 ; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 -; BWON-F16C-NEXT: vpsrlq $48, %xmm0, %xmm3 -; BWON-F16C-NEXT: vpextrw $0, %xmm3, %edi -; BWON-F16C-NEXT: movzwl %di, %edi -; BWON-F16C-NEXT: vmovd %edi, %xmm3 +; BWON-F16C-NEXT: vpshufb %xmm3, %xmm0, %xmm3 ; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm6 ; BWON-F16C-NEXT: vucomiss %xmm2, %xmm6 ; BWON-F16C-NEXT: ja .LBB26_10 @@ -1703,54 +1677,39 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 { ; BWON-F16C-NEXT: vpinsrw $0, %esi, %xmm0, %xmm5 ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm6, %xmm6 ; BWON-F16C-NEXT: vmovd %xmm6, %eax -; BWON-F16C-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3] -; BWON-F16C-NEXT: vpextrw $0, %xmm6, %ecx -; BWON-F16C-NEXT: movzwl %cx, %ecx -; BWON-F16C-NEXT: vmovd %ecx, %xmm6 +; BWON-F16C-NEXT: vpsrlq $48, %xmm1, %xmm6 +; BWON-F16C-NEXT: vcvtph2ps %xmm6, %xmm7 +; BWON-F16C-NEXT: vpsrlq $48, %xmm0, %xmm6 ; BWON-F16C-NEXT: vcvtph2ps %xmm6, %xmm6 -; BWON-F16C-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] -; BWON-F16C-NEXT: vpextrw $0, %xmm7, %ecx -; BWON-F16C-NEXT: movzwl %cx, %ecx -; BWON-F16C-NEXT: vmovd %ecx, %xmm7 -; BWON-F16C-NEXT: vcvtph2ps %xmm7, %xmm7 -; BWON-F16C-NEXT: vucomiss %xmm6, %xmm7 +; BWON-F16C-NEXT: vucomiss %xmm7, %xmm6 ; BWON-F16C-NEXT: ja .LBB26_12 ; BWON-F16C-NEXT: # %bb.11: -; BWON-F16C-NEXT: vmovaps %xmm6, %xmm7 +; BWON-F16C-NEXT: vmovaps %xmm7, %xmm6 ; BWON-F16C-NEXT: .LBB26_12: ; BWON-F16C-NEXT: vpunpcklwd 
{{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4 -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm7, %xmm5 +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm6, %xmm5 ; BWON-F16C-NEXT: vmovd %xmm5, %eax ; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm5 -; BWON-F16C-NEXT: vpextrw $0, %xmm1, %eax -; BWON-F16C-NEXT: movzwl %ax, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm6 -; BWON-F16C-NEXT: vcvtph2ps %xmm6, %xmm6 -; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax -; BWON-F16C-NEXT: movzwl %ax, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm7 +; BWON-F16C-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; BWON-F16C-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; BWON-F16C-NEXT: vcvtph2ps %xmm7, %xmm7 -; BWON-F16C-NEXT: vucomiss %xmm6, %xmm7 +; BWON-F16C-NEXT: vpshufb %xmm6, %xmm0, %xmm6 +; BWON-F16C-NEXT: vcvtph2ps %xmm6, %xmm6 +; BWON-F16C-NEXT: vucomiss %xmm7, %xmm6 ; BWON-F16C-NEXT: ja .LBB26_14 ; BWON-F16C-NEXT: # %bb.13: -; BWON-F16C-NEXT: vmovaps %xmm6, %xmm7 +; BWON-F16C-NEXT: vmovaps %xmm7, %xmm6 ; BWON-F16C-NEXT: .LBB26_14: -; BWON-F16C-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm7, %xmm4 +; BWON-F16C-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm6, %xmm4 ; BWON-F16C-NEXT: vmovd %xmm4, %eax ; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4 -; BWON-F16C-NEXT: vpsrld $16, %xmm1, %xmm1 -; BWON-F16C-NEXT: vpextrw $0, %xmm1, %eax -; BWON-F16C-NEXT: movzwl 
%ax, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm1 +; BWON-F16C-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: vpsrld $16, %xmm0, %xmm0 -; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax -; BWON-F16C-NEXT: movzwl %ax, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: vucomiss %xmm1, %xmm0 ; BWON-F16C-NEXT: ja .LBB26_16 @@ -1760,7 +1719,7 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 { ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; BWON-F16C-NEXT: vmovd %xmm0, %eax ; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; BWON-F16C-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; BWON-F16C-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; BWON-F16C-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll index fa1014e3ae0da6..a21653bc7330c9 100644 --- a/llvm/test/CodeGen/X86/pr31088.ll +++ b/llvm/test/CodeGen/X86/pr31088.ll @@ -41,15 +41,11 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind { ; ; F16C-LABEL: ir_fadd_v1f16: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vpextrw $0, %xmm1, %ecx -; F16C-NEXT: movzwl %cx, %ecx -; F16C-NEXT: vmovd %ecx, %xmm0 -; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: movzwl %ax, %eax -; F16C-NEXT: vmovd %eax, %xmm1 +; F16C-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; F16C-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; F16C-NEXT: 
vcvtph2ps %xmm0, %xmm0 +; F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; F16C-NEXT: vmovd %xmm0, %eax ; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr34605.ll b/llvm/test/CodeGen/X86/pr34605.ll index b4c10c4597e051..eeb8317fb58a7a 100644 --- a/llvm/test/CodeGen/X86/pr34605.ll +++ b/llvm/test/CodeGen/X86/pr34605.ll @@ -17,7 +17,7 @@ define void @pr34605(i8* nocapture %s, i32 %p) { ; CHECK-NEXT: kmovd %ecx, %k1 ; CHECK-NEXT: kmovd %k1, %k1 ; CHECK-NEXT: kandq %k1, %k0, %k1 -; CHECK-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0 {%k1} {z} +; CHECK-NEXT: vmovdqu8 {{.*#+}} zmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; CHECK-NEXT: vmovdqu64 %zmm0, (%eax) ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, 64(%eax) diff --git a/llvm/test/CodeGen/X86/pr38803.ll b/llvm/test/CodeGen/X86/pr38803.ll index 53d31b70e902a4..986d558c482b47 100644 --- a/llvm/test/CodeGen/X86/pr38803.ll +++ b/llvm/test/CodeGen/X86/pr38803.ll @@ -13,7 +13,7 @@ define dso_local float @_Z3fn2v() { ; CHECK-NEXT: callq _Z1av@PLT ; CHECK-NEXT: # kill: def $al killed $al def $eax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z} +; CHECK-NEXT: vmovss {{.*#+}} xmm0 {%k1} {z} = [7.5E-1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: cmpl $0, c(%rip) ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.then diff --git a/llvm/test/CodeGen/X86/pr43509.ll b/llvm/test/CodeGen/X86/pr43509.ll index 87ddad03e9c452..a29fe4c6a0465a 100644 --- a/llvm/test/CodeGen/X86/pr43509.ll +++ b/llvm/test/CodeGen/X86/pr43509.ll @@ -7,7 +7,7 @@ define <8 x i8> @foo(<8 x float> %arg) { ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vcmpltps %ymm1, %ymm0, %k1 ; CHECK-NEXT: vcmpgtps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k1 {%k1} -; CHECK-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} 
{z} +; CHECK-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq bb: diff --git a/llvm/test/CodeGen/X86/pr57340.ll b/llvm/test/CodeGen/X86/pr57340.ll index 57f52c8dcdbb06..95f839c338e701 100644 --- a/llvm/test/CodeGen/X86/pr57340.ll +++ b/llvm/test/CodeGen/X86/pr57340.ll @@ -5,54 +5,43 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-LABEL: main.41: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpbroadcastw (%rax), %xmm0 -; CHECK-NEXT: vmovdqu (%rax), %ymm2 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 -; CHECK-NEXT: vpextrw $0, %xmm0, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-NEXT: vmovdqu (%rax), %xmm5 -; CHECK-NEXT: vpextrw $0, %xmm5, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm2 +; CHECK-NEXT: vmovdqu (%rax), %ymm1 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 +; CHECK-NEXT: vmovdqu (%rax), %xmm10 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; CHECK-NEXT: vpshufb %xmm1, %xmm10, %xmm2 ; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2 -; CHECK-NEXT: vucomiss %xmm0, %xmm2 +; CHECK-NEXT: vpshufb %xmm1, %xmm3, %xmm4 +; CHECK-NEXT: vcvtph2ps %xmm4, %xmm4 +; CHECK-NEXT: vucomiss %xmm4, %xmm2 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl -; CHECK-NEXT: vpsrld $16, %xmm1, %xmm3 -; CHECK-NEXT: vpextrw $0, %xmm3, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm3 -; CHECK-NEXT: vpsrld $16, %xmm5, %xmm4 -; CHECK-NEXT: vpextrw $0, %xmm4, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm4 ; CHECK-NEXT: setne %al -; CHECK-NEXT: andl $1, %eax -; 
CHECK-NEXT: vcvtph2ps %xmm3, %xmm6 -; CHECK-NEXT: vcvtph2ps %xmm4, %xmm3 -; CHECK-NEXT: kmovw %eax, %k0 -; CHECK-NEXT: vucomiss %xmm6, %xmm3 +; CHECK-NEXT: kmovd %eax, %k0 +; CHECK-NEXT: kshiftlw $15, %k0, %k0 +; CHECK-NEXT: kshiftrw $14, %k0, %k0 +; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; CHECK-NEXT: vcvtph2ps %xmm4, %xmm11 +; CHECK-NEXT: vucomiss %xmm0, %xmm11 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl ; CHECK-NEXT: setne %al -; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: kshiftlw $15, %k1, %k1 -; CHECK-NEXT: kshiftrw $14, %k1, %k1 -; CHECK-NEXT: korw %k1, %k0, %k0 +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: korw %k0, %k1, %k0 ; CHECK-NEXT: movw $-5, %ax ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: vprolq $32, %xmm1, %xmm4 -; CHECK-NEXT: vpextrw $0, %xmm4, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm4 -; CHECK-NEXT: vcvtph2ps %xmm4, %xmm4 -; CHECK-NEXT: vucomiss %xmm4, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; CHECK-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5 +; CHECK-NEXT: vucomiss %xmm5, %xmm0 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -63,18 +52,12 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: movw $-9, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vpsrlq $48, %xmm1, %xmm4 -; CHECK-NEXT: vpextrw $0, %xmm4, %eax +; CHECK-NEXT: vpsrlq $48, %xmm3, %xmm5 +; CHECK-NEXT: vcvtph2ps %xmm5, %xmm6 +; CHECK-NEXT: vpsrlq $48, %xmm10, %xmm5 +; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5 ; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm4 -; CHECK-NEXT: vcvtph2ps 
%xmm4, %xmm6 -; CHECK-NEXT: vpsrlq $48, %xmm5, %xmm4 -; CHECK-NEXT: vpextrw $0, %xmm4, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm4 -; CHECK-NEXT: vcvtph2ps %xmm4, %xmm4 -; CHECK-NEXT: vucomiss %xmm6, %xmm4 +; CHECK-NEXT: vucomiss %xmm6, %xmm5 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -85,13 +68,11 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: movw $-17, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,0,1] -; CHECK-NEXT: vpextrw $0, %xmm6, %eax +; CHECK-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm6 -; CHECK-NEXT: vcvtph2ps %xmm6, %xmm6 -; CHECK-NEXT: vucomiss %xmm6, %xmm0 +; CHECK-NEXT: vpshufb %xmm6, %xmm3, %xmm7 +; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7 +; CHECK-NEXT: vucomiss %xmm7, %xmm0 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -102,18 +83,13 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: movw $-33, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm6 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrw $0, %xmm6, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm6 -; CHECK-NEXT: vcvtph2ps %xmm6, %xmm7 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrw $0, %xmm6, %eax +; CHECK-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; CHECK-NEXT: vpshufb %xmm7, %xmm10, %xmm8 +; CHECK-NEXT: vcvtph2ps %xmm8, %xmm8 ; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm6 -; CHECK-NEXT: vcvtph2ps %xmm6, %xmm6 -; CHECK-NEXT: vucomiss %xmm7, %xmm6 +; 
CHECK-NEXT: vpshufb %xmm7, %xmm3, %xmm9 +; CHECK-NEXT: vcvtph2ps %xmm9, %xmm9 +; CHECK-NEXT: vucomiss %xmm9, %xmm8 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -124,13 +100,11 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: movw $-65, %ax ; CHECK-NEXT: kmovd %eax, %k1 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; CHECK-NEXT: vpshufb %xmm9, %xmm3, %xmm12 +; CHECK-NEXT: vcvtph2ps %xmm12, %xmm12 ; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[3,3,3,3] -; CHECK-NEXT: vpextrw $0, %xmm7, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm7 -; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7 -; CHECK-NEXT: vucomiss %xmm7, %xmm0 +; CHECK-NEXT: vucomiss %xmm12, %xmm0 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -142,17 +116,11 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: movw $-129, %ax ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm7 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrw $0, %xmm7, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm7 -; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrw $0, %xmm5, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm5 -; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5 -; CHECK-NEXT: vucomiss %xmm7, %xmm5 +; CHECK-NEXT: vpsrldq {{.*#+}} xmm12 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vcvtph2ps %xmm12, %xmm12 +; CHECK-NEXT: vpsrldq {{.*#+}} xmm10 = xmm10[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vcvtph2ps %xmm10, %xmm10 +; CHECK-NEXT: vucomiss 
%xmm12, %xmm10 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -163,13 +131,11 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: movw $-257, %ax # imm = 0xFEFF ; CHECK-NEXT: kmovd %eax, %k1 +; CHECK-NEXT: vextracti128 $1, %ymm3, %xmm3 +; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; CHECK-NEXT: vcvtph2ps %xmm12, %xmm12 ; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vpextrw $0, %xmm1, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm7 -; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7 -; CHECK-NEXT: vucomiss %xmm7, %xmm2 +; CHECK-NEXT: vucomiss %xmm12, %xmm11 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -181,12 +147,9 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: movw $-513, %ax # imm = 0xFDFF ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: vpsrld $16, %xmm1, %xmm2 -; CHECK-NEXT: vpextrw $0, %xmm2, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm2 -; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2 -; CHECK-NEXT: vucomiss %xmm2, %xmm3 +; CHECK-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1 +; CHECK-NEXT: vucomiss %xmm1, %xmm2 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -197,13 +160,10 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: movw $-1025, %ax # imm = 0xFBFF ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vprolq $32, %xmm1, %xmm2 -; CHECK-NEXT: vpextrw $0, %xmm2, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm2 -; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2 ; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: vucomiss %xmm2, %xmm0 +; CHECK-NEXT: vpshufb %xmm4, %xmm3, %xmm1 +; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1 +; CHECK-NEXT: vucomiss %xmm1, %xmm0 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl 
; CHECK-NEXT: testb %al, %cl @@ -215,12 +175,9 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: movw $-2049, %ax # imm = 0xF7FF ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: vpsrlq $48, %xmm1, %xmm2 -; CHECK-NEXT: vpextrw $0, %xmm2, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm2 -; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2 -; CHECK-NEXT: vucomiss %xmm2, %xmm4 +; CHECK-NEXT: vpsrlq $48, %xmm3, %xmm1 +; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1 +; CHECK-NEXT: vucomiss %xmm1, %xmm5 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -231,13 +188,10 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: movw $-4097, %ax # imm = 0xEFFF ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; CHECK-NEXT: vpextrw $0, %xmm2, %eax ; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm2 -; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2 -; CHECK-NEXT: vucomiss %xmm2, %xmm0 +; CHECK-NEXT: vpshufb %xmm6, %xmm3, %xmm1 +; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1 +; CHECK-NEXT: vucomiss %xmm1, %xmm0 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -249,12 +203,9 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: movw $-8193, %ax # imm = 0xDFFF ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrw $0, %xmm2, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm2 -; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2 -; CHECK-NEXT: vucomiss %xmm2, %xmm6 +; CHECK-NEXT: vpshufb %xmm7, %xmm3, %xmm1 +; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1 +; CHECK-NEXT: vucomiss %xmm1, %xmm8 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -265,13 +216,10 @@ define void @main.41() local_unnamed_addr #1 { ; 
CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: movw $-16385, %ax # imm = 0xBFFF ; CHECK-NEXT: kmovd %eax, %k1 +; CHECK-NEXT: vpshufb %xmm9, %xmm3, %xmm1 +; CHECK-NEXT: vcvtph2ps %xmm1, %xmm1 ; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; CHECK-NEXT: vpextrw $0, %xmm2, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm2 -; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2 -; CHECK-NEXT: vucomiss %xmm2, %xmm0 +; CHECK-NEXT: vucomiss %xmm1, %xmm0 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -280,13 +228,10 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: kshiftlw $14, %k1, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kshiftlw $1, %k0, %k0 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrw $0, %xmm0, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm0 +; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-NEXT: kshiftrw $1, %k0, %k0 -; CHECK-NEXT: vucomiss %xmm0, %xmm5 +; CHECK-NEXT: vucomiss %xmm0, %xmm10 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -294,7 +239,7 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: kshiftlw $15, %k1, %k1 ; CHECK-NEXT: korw %k1, %k0, %k1 -; CHECK-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z} +; CHECK-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; CHECK-NEXT: vmovdqa %xmm0, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr78897.ll b/llvm/test/CodeGen/X86/pr78897.ll index 0c1c3cafc4ea6b..56e4ec2bc8ecbb 100644 --- a/llvm/test/CodeGen/X86/pr78897.ll +++ b/llvm/test/CodeGen/X86/pr78897.ll @@ -225,7 +225,7 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) 
nounwind { ; X86-AVX512-NEXT: pushl %esi ; X86-AVX512-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm0 ; X86-AVX512-NEXT: vptestnmb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %k1 -; X86-AVX512-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 {%k1} {z} +; X86-AVX512-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] ; X86-AVX512-NEXT: vpextrd $1, %xmm0, %eax ; X86-AVX512-NEXT: vmovd %xmm0, %edx ; X86-AVX512-NEXT: movl $286331152, %ecx # imm = 0x11111110 @@ -258,7 +258,7 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind { ; X64-AVX512: # %bb.0: # %entry ; X64-AVX512-NEXT: vpbroadcastb %edi, %xmm0 ; X64-AVX512-NEXT: vptestnmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 -; X64-AVX512-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z} +; X64-AVX512-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110 ; X64-AVX512-NEXT: movabsq $76861433640456465, %rdx # imm = 0x111111111111111 diff --git a/llvm/test/CodeGen/X86/select-of-fp-constants.ll b/llvm/test/CodeGen/X86/select-of-fp-constants.ll index 76b8ea8e2b8a2b..2cdaa11a22530b 100644 --- a/llvm/test/CodeGen/X86/select-of-fp-constants.ll +++ b/llvm/test/CodeGen/X86/select-of-fp-constants.ll @@ -86,7 +86,7 @@ define float @fcmp_select_fp_constants(float %x) nounwind readnone { ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vcmpneqss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 ; X64-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0] -; X64-AVX512F-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} +; X64-AVX512F-NEXT: vmovss {{.*#+}} xmm0 {%k1} = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; X64-AVX512F-NEXT: retq %c = fcmp une float %x, -4.0 %r = select i1 %c, float 42.0, float 23.0 diff --git a/llvm/test/CodeGen/X86/select-of-half-constants.ll b/llvm/test/CodeGen/X86/select-of-half-constants.ll index 
e22d4c8b792dca..e3d92eb474968a 100644 --- a/llvm/test/CodeGen/X86/select-of-half-constants.ll +++ b/llvm/test/CodeGen/X86/select-of-half-constants.ll @@ -6,9 +6,9 @@ define half @fcmp_select_fp_constants_olt(half %x) nounwind readnone { ; X64-AVX512FP16-LABEL: fcmp_select_fp_constants_olt: ; X64-AVX512FP16: # %bb.0: -; X64-AVX512FP16-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-AVX512FP16-NEXT: vmovsh {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-AVX512FP16-NEXT: vcmpltsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 -; X64-AVX512FP16-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-AVX512FP16-NEXT: vmovsh {{.*#+}} xmm0 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-AVX512FP16-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} ; X64-AVX512FP16-NEXT: retq %c = fcmp olt half %x, -4.0 @@ -19,9 +19,9 @@ define half @fcmp_select_fp_constants_olt(half %x) nounwind readnone { define half @fcmp_select_fp_constants_ogt(half %x) nounwind readnone { ; X64-AVX512FP16-LABEL: fcmp_select_fp_constants_ogt: ; X64-AVX512FP16: # %bb.0: -; X64-AVX512FP16-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-AVX512FP16-NEXT: vmovsh {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-AVX512FP16-NEXT: vcmpgtsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 -; X64-AVX512FP16-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-AVX512FP16-NEXT: vmovsh {{.*#+}} xmm0 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-AVX512FP16-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} ; X64-AVX512FP16-NEXT: retq %c = fcmp ogt half %x, -4.0 diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll index f59960f06f4a11..3b82df5d5b74d2 100644 --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -4976,32 +4976,22 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { ; ; F16C-LABEL: 
fptosi_2f16_to_4i32: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: movzwl %ax, %eax -; F16C-NEXT: vmovd %eax, %xmm1 +; F16C-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; F16C-NEXT: vpsrld $16, %xmm0, %xmm0 -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: movzwl %ax, %eax -; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; F16C-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; F16C-NEXT: vcvttps2dq %xmm0, %xmm0 ; F16C-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; F16C-NEXT: retq ; ; AVX512-LABEL: fptosi_2f16_to_4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpextrw $0, %xmm0, %eax -; AVX512-NEXT: movzwl %ax, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512-NEXT: vpextrw $0, %xmm0, %eax -; AVX512-NEXT: movzwl %ax, %eax -; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll index 71c4427da96253..6e8eefc607ee11 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -413,13 +413,9 @@ define half @test_v2f16(<2 x half> %a0) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # 
kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512F-NEXT: vpextrw $0, %xmm0, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vpextrw $0, %xmm1, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm3 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX512F-NEXT: xorl %eax, %eax ; AVX512F-NEXT: vucomiss %xmm3, %xmm2 @@ -434,13 +430,9 @@ define half @test_v2f16(<2 x half> %a0) nounwind { ; AVX512VL-LABEL: test_v2f16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512VL-NEXT: vpextrw $0, %xmm0, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm2 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512VL-NEXT: vpextrw $0, %xmm1, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX512VL-NEXT: xorl %eax, %eax ; AVX512VL-NEXT: vucomiss %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll index 0b2f9d69f0623c..804ca183ad4c9d 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll @@ -412,13 +412,9 @@ define half @test_v2f16(<2 x half> %a0) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512F-NEXT: vpextrw $0, %xmm0, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vpextrw $0, %xmm1, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm3 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX512F-NEXT: xorl %eax, %eax ; AVX512F-NEXT: vucomiss %xmm3, %xmm2 @@ -433,13 +429,9 @@ define half @test_v2f16(<2 x half> %a0) nounwind { ; AVX512VL-LABEL: test_v2f16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512VL-NEXT: vpextrw $0, %xmm0, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm2 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512VL-NEXT: vpextrw $0, %xmm1, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX512VL-NEXT: xorl %eax, %eax ; AVX512VL-NEXT: vucomiss %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll index 3603ff466e24c1..04d88a297506d7 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -2012,24 +2012,18 @@ define <4 x i32> @extract3_insert0_v4i32_7123(<4 x i32> %a0, <4 x i32> %a1) { ; SSE2-LABEL: extract3_insert0_v4i32_7123: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: retq ; ; SSE3-LABEL: extract3_insert0_v4i32_7123: ; SSE3: # %bb.0: ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE3-NEXT: movd %xmm1, %eax -; SSE3-NEXT: movd %eax, %xmm1 ; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: 
extract3_insert0_v4i32_7123: ; SSSE3: # %bb.0: ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSSE3-NEXT: movd %xmm1, %eax -; SSSE3-NEXT: movd %eax, %xmm1 ; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSSE3-NEXT: retq ; diff --git a/llvm/test/MC/ARM/thumbv8m.s b/llvm/test/MC/ARM/thumbv8m.s index f03dd03dae3a4f..0e9ab4a9b3bf91 100644 --- a/llvm/test/MC/ARM/thumbv8m.s +++ b/llvm/test/MC/ARM/thumbv8m.s @@ -184,13 +184,13 @@ ttat r0, r1 // 'Lazy Load/Store Multiple' // UNDEF-BASELINE: error: instruction requires: armv8m.main -// CHECK-MAINLINE: vlldm r5, {d0 - d15} @ encoding: [0x35,0xec,0x00,0x0a] -// CHECK-MAINLINE_DSP: vlldm r5, {d0 - d15} @ encoding: [0x35,0xec,0x00,0x0a] +// CHECK-MAINLINE: vlldm r5 @ encoding: [0x35,0xec,0x00,0x0a] +// CHECK-MAINLINE_DSP: vlldm r5 @ encoding: [0x35,0xec,0x00,0x0a] vlldm r5 // UNDEF-BASELINE: error: instruction requires: armv8m.main -// CHECK-MAINLINE: vlstm r10, {d0 - d15} @ encoding: [0x2a,0xec,0x00,0x0a] -// CHECK-MAINLINE_DSP: vlstm r10, {d0 - d15} @ encoding: [0x2a,0xec,0x00,0x0a] +// CHECK-MAINLINE: vlstm r10 @ encoding: [0x2a,0xec,0x00,0x0a] +// CHECK-MAINLINE_DSP: vlstm r10 @ encoding: [0x2a,0xec,0x00,0x0a] vlstm r10 // New SYSm's diff --git a/llvm/test/MC/ARM/vlstm-vlldm-8.1m.s b/llvm/test/MC/ARM/vlstm-vlldm-8.1m.s deleted file mode 100644 index 4e35883ffe4332..00000000000000 --- a/llvm/test/MC/ARM/vlstm-vlldm-8.1m.s +++ /dev/null @@ -1,11 +0,0 @@ -// RUN: llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding %s \ -// RUN: | FileCheck --check-prefixes=CHECK %s - -// RUN: llvm-mc -triple=thumbv8.1m.main-none-eabi -mcpu=generic -show-encoding %s \ -// RUN: | FileCheck --check-prefixes=CHECK %s - -vlstm r8, {d0 - d31} -// CHECK: vlstm r8, {d0 - d31} @ encoding: [0x28,0xec,0x80,0x0a] - -vlldm r8, {d0 - d31} -// CHECK: vlldm r8, {d0 - d31} @ encoding: [0x38,0xec,0x80,0x0a] diff --git a/llvm/test/MC/ARM/vlstm-vlldm-8m.s b/llvm/test/MC/ARM/vlstm-vlldm-8m.s deleted file mode 100644 index 
bbc95318aeb3d0..00000000000000 --- a/llvm/test/MC/ARM/vlstm-vlldm-8m.s +++ /dev/null @@ -1,17 +0,0 @@ -// RUN: llvm-mc -triple=armv8m.main-arm-none-eabi -mcpu=generic -show-encoding %s \ -// RUN: | FileCheck --check-prefixes=CHECK %s - -// RUN: llvm-mc -triple=thumbv8m.main-none-eabi -mcpu=generic -show-encoding %s \ -// RUN: | FileCheck --check-prefixes=CHECK %s - -vlstm r8, {d0 - d15} -// CHECK: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a] - -vlldm r8, {d0 - d15} -// CHECK: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a] - -vlstm r8 -// CHECK: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a] - -vlldm r8 -// CHECK: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a] diff --git a/llvm/test/MC/ARM/vlstm-vlldm-diag.s b/llvm/test/MC/ARM/vlstm-vlldm-diag.s deleted file mode 100644 index b57f535c6a25cf..00000000000000 --- a/llvm/test/MC/ARM/vlstm-vlldm-diag.s +++ /dev/null @@ -1,61 +0,0 @@ -// RUN: not llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding %s 2>&1 >/dev/null \ -// RUN: | FileCheck --check-prefixes=ERR %s - -// RUN: not llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding %s 2>&1 >/dev/null \ -// RUN: | FileCheck --check-prefixes=ERRT2 %s - -vlstm r8, {d0 - d11} -// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) -// ERR-NEXT: vlstm r8, {d0 - d11} - -vlldm r8, {d0 - d11} -// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) -// ERR-NEXT: vlldm r8, {d0 - d11} - -vlstm r8, {d3 - d15} -// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) -// ERR-NEXT: vlstm r8, {d3 - d15} - -vlldm r8, {d3 - d15} -// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) -// ERR-NEXT: vlldm r8, {d3 - d15} - -vlstm r8, {d0 - d29} -// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) -// ERR-NEXT: vlstm r8, {d0 - d29} - -vlldm r8, {d0 - d29} -// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) -// ERR-NEXT: vlldm 
r8, {d0 - d29} - -vlstm r8, {d3 - d31} -// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) -// ERR-NEXT: vlstm r8, {d3 - d31} - -vlldm r8, {d3 - d31} -// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) -// ERR-NEXT: vlldm r8, {d3 - d31} - -vlstm r8, {d0 - d35} -// ERR: error: register expected -// ERR-NEXT: vlstm r8, {d0 - d35} - -vlldm r8, {d0 - d35} -// ERR: error: register expected -// ERR-NEXT: vlldm r8, {d0 - d35} - -vlstm pc -// ERR: error: operand must be a register in range [r0, r14] -// ERR-NEXT: vlstm pc - -vlldm pc -// ERR: error: operand must be a register in range [r0, r14] -// ERR-NEXT: vlldm pc - -vlstm pc -// ERRT2: error: operand must be a register in range [r0, r14] -// ERRT2-NEXT: vlstm pc - -vlldm pc -// ERRT2: error: operand must be a register in range [r0, r14] -// ERRT2-NEXT: vlldm pc \ No newline at end of file diff --git a/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt deleted file mode 100644 index 6b9882454c06a3..00000000000000 --- a/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt +++ /dev/null @@ -1,11 +0,0 @@ -// RUN: llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding -disassemble %s \ -// RUN: | FileCheck %s --check-prefixes=CHECK-DISS - -// RUN: llvm-mc -triple=thumbv8.1m.main-none-eabi -mcpu=generic -show-encoding -disassemble %s \ -// RUN: | FileCheck %s --check-prefixes=CHECK-DISS - -[0x28,0xec,0x80,0x0a] -// CHECK-DISS: vlstm r8, {d0 - d31} @ encoding: [0x28,0xec,0x80,0x0a] - -[0x38,0xec,0x80,0x0a] -// CHECK-DISS: vlldm r8, {d0 - d31} @ encoding: [0x38,0xec,0x80,0x0a] \ No newline at end of file diff --git a/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt deleted file mode 100644 index 1e28d5284c5b2a..00000000000000 --- a/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt +++ /dev/null 
@@ -1,17 +0,0 @@ -// RUN: llvm-mc -triple=armv8m.main-arm-none-eabi -mcpu=generic -show-encoding -disassemble %s \ -// RUN: | FileCheck %s --check-prefixes=CHECK-DISS - -// RUN: llvm-mc -triple=thumbv8m.main-none-eabi -mcpu=generic -show-encoding -disassemble %s \ -// RUN: | FileCheck %s --check-prefixes=CHECK-DISS - -[0x28,0xec,0x00,0x0a] -// CHECK-DISS: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a] - -[0x38,0xec,0x00,0x0a] -// CHECK-DISS: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a] - -[0x28,0xec,0x00,0x0a] -// CHECK-DISS: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a] - -[0x38,0xec,0x00,0x0a] -// CHECK-DISS: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a] \ No newline at end of file diff --git a/llvm/test/MC/RISCV/rv32zacas-invalid.s b/llvm/test/MC/RISCV/rv32zacas-invalid.s index b86246ca2ed180..11d20dacd8a788 100644 --- a/llvm/test/MC/RISCV/rv32zacas-invalid.s +++ b/llvm/test/MC/RISCV/rv32zacas-invalid.s @@ -1,4 +1,4 @@ -# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-zacas < %s 2>&1 | FileCheck %s +# RUN: not llvm-mc -triple riscv32 -mattr=+zacas < %s 2>&1 | FileCheck %s # Non-zero offsets not supported for the third operand (rs1). 
amocas.w a1, a3, 1(a5) # CHECK: :[[@LINE]]:18: error: optional integer offset must be 0 diff --git a/llvm/test/MC/RISCV/rv32zacas-valid.s b/llvm/test/MC/RISCV/rv32zacas-valid.s index d80b963f0a0477..05a9cdd5cc2188 100644 --- a/llvm/test/MC/RISCV/rv32zacas-valid.s +++ b/llvm/test/MC/RISCV/rv32zacas-valid.s @@ -1,12 +1,12 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zacas -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zacas -riscv-no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zacas -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zacas -riscv-no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zacas < %s \ -# RUN: | llvm-objdump --mattr=+experimental-zacas -M no-aliases -d -r - \ +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zacas < %s \ +# RUN: | llvm-objdump --mattr=+zacas -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zacas < %s \ -# RUN: | llvm-objdump --mattr=+experimental-zacas -M no-aliases -d -r - \ +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zacas < %s \ +# RUN: | llvm-objdump --mattr=+zacas -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # RUN: not llvm-mc -triple=riscv32 -mattr=+a -show-encoding %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-ERROR diff --git a/llvm/test/MC/RISCV/rv64zacas-valid.s b/llvm/test/MC/RISCV/rv64zacas-valid.s index 843401b50871af..694f43b9b4407b 100644 --- a/llvm/test/MC/RISCV/rv64zacas-valid.s +++ b/llvm/test/MC/RISCV/rv64zacas-valid.s @@ -1,7 +1,7 @@ -# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zacas -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zacas 
-riscv-no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zacas < %s \ -# RUN: | llvm-objdump --mattr=+experimental-zacas -M no-aliases -d -r - \ +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zacas < %s \ +# RUN: | llvm-objdump --mattr=+zacas -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # RUN: not llvm-mc -triple=riscv64 -mattr=+a -show-encoding %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-ERROR diff --git a/llvm/test/MC/RISCV/rvzabha-zacas-valid.s b/llvm/test/MC/RISCV/rvzabha-zacas-valid.s index 8ad2f99d3febad..f1f705e625b87a 100644 --- a/llvm/test/MC/RISCV/rvzabha-zacas-valid.s +++ b/llvm/test/MC/RISCV/rvzabha-zacas-valid.s @@ -1,12 +1,12 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zabha,+experimental-zacas -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zabha,+zacas -riscv-no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zabha,+experimental-zacas -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zabha,+zacas -riscv-no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zabha,+experimental-zacas < %s \ -# RUN: | llvm-objdump --mattr=+experimental-zabha,+experimental-zacas -M no-aliases -d -r - \ +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zabha,+zacas < %s \ +# RUN: | llvm-objdump --mattr=+experimental-zabha,+zacas -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zabha,+experimental-zacas < %s \ -# RUN: | llvm-objdump --mattr=+experimental-zabha,+experimental-zacas -M no-aliases -d -r - \ 
+# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zabha,+zacas < %s \ +# RUN: | llvm-objdump --mattr=+experimental-zabha,+zacas -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # RUN: not llvm-mc -triple=riscv32 -mattr=+experimental-zabha -show-encoding %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-ERROR diff --git a/llvm/test/Transforms/InstCombine/maxnum.ll b/llvm/test/Transforms/InstCombine/maxnum.ll index 87288b18cbcd9f..e140a5b405ea84 100644 --- a/llvm/test/Transforms/InstCombine/maxnum.ll +++ b/llvm/test/Transforms/InstCombine/maxnum.ll @@ -66,7 +66,7 @@ define float @constant_fold_maxnum_f32_p0_n0() { define float @constant_fold_maxnum_f32_n0_p0() { ; CHECK-LABEL: @constant_fold_maxnum_f32_n0_p0( -; CHECK-NEXT: ret float -0.000000e+00 +; CHECK-NEXT: ret float 0.000000e+00 ; %x = call float @llvm.maxnum.f32(float -0.0, float 0.0) ret float %x diff --git a/llvm/test/Transforms/InstCombine/minnum.ll b/llvm/test/Transforms/InstCombine/minnum.ll index 8050f075595272..cc6171b9d8e6cb 100644 --- a/llvm/test/Transforms/InstCombine/minnum.ll +++ b/llvm/test/Transforms/InstCombine/minnum.ll @@ -60,7 +60,7 @@ define float @constant_fold_minnum_f32_p0_p0() { define float @constant_fold_minnum_f32_p0_n0() { ; CHECK-LABEL: @constant_fold_minnum_f32_p0_n0( -; CHECK-NEXT: ret float 0.000000e+00 +; CHECK-NEXT: ret float -0.000000e+00 ; %x = call float @llvm.minnum.f32(float 0.0, float -0.0) ret float %x @@ -199,7 +199,7 @@ define float @minnum_f32_1_minnum_p0_val_fmf3(float %x) { define float @minnum_f32_p0_minnum_val_n0(float %x) { ; CHECK-LABEL: @minnum_f32_p0_minnum_val_n0( -; CHECK-NEXT: [[Z:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float 0.000000e+00) +; CHECK-NEXT: [[Z:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float -0.000000e+00) ; CHECK-NEXT: ret float [[Z]] ; %y = call float @llvm.minnum.f32(float %x, float -0.0) diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/min-max.ll 
b/llvm/test/Transforms/InstSimplify/ConstProp/min-max.ll index a5f5d4e12ed842..9120649eb5c4f1 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/min-max.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/min-max.ll @@ -49,6 +49,38 @@ define float @minnum_float() { ret float %1 } +define float @minnum_float_p0_n0() { +; CHECK-LABEL: @minnum_float_p0_n0( +; CHECK-NEXT: ret float -0.000000e+00 +; + %min = call float @llvm.minnum.f32(float 0.0, float -0.0) + ret float %min +} + +define float @minnum_float_n0_p0() { +; CHECK-LABEL: @minnum_float_n0_p0( +; CHECK-NEXT: ret float -0.000000e+00 +; + %min = call float @llvm.minnum.f32(float -0.0, float 0.0) + ret float %min +} + +define float @minnum_float_p0_qnan() { +; CHECK-LABEL: @minnum_float_p0_qnan( +; CHECK-NEXT: ret float 0.000000e+00 +; + %min = call float @llvm.minnum.f32(float 0.0, float 0x7FF8000000000000) + ret float %min +} + +define float @minnum_float_qnan_p0() { +; CHECK-LABEL: @minnum_float_qnan_p0( +; CHECK-NEXT: ret float 0.000000e+00 +; + %min = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0.0) + ret float %min +} + define bfloat @minnum_bfloat() { ; CHECK-LABEL: @minnum_bfloat( ; CHECK-NEXT: ret bfloat 0xR40A0 @@ -95,7 +127,7 @@ define <4 x half> @minnum_half_vec() { define <4 x float> @minnum_float_zeros_vec() { ; CHECK-LABEL: @minnum_float_zeros_vec( -; CHECK-NEXT: ret <4 x float> +; CHECK-NEXT: ret <4 x float> ; %1 = call <4 x float> @llvm.minnum.v4f32(<4 x float> , <4 x float> ) ret <4 x float> %1 @@ -109,6 +141,38 @@ define float @maxnum_float() { ret float %1 } +define float @maxnum_float_p0_n0() { +; CHECK-LABEL: @maxnum_float_p0_n0( +; CHECK-NEXT: ret float 0.000000e+00 +; + %max = call float @llvm.maxnum.f32(float 0.0, float -0.0) + ret float %max +} + +define float @maxnum_float_n0_p0() { +; CHECK-LABEL: @maxnum_float_n0_p0( +; CHECK-NEXT: ret float 0.000000e+00 +; + %max = call float @llvm.maxnum.f32(float -0.0, float 0.0) + ret float %max +} + +define float 
@maxnum_float_p0_qnan() { +; CHECK-LABEL: @maxnum_float_p0_qnan( +; CHECK-NEXT: ret float 0.000000e+00 +; + %max = call float @llvm.maxnum.f32(float 0.0, float 0x7FF8000000000000) + ret float %max +} + +define float @maxnum_float_qnan_p0() { +; CHECK-LABEL: @maxnum_float_qnan_p0( +; CHECK-NEXT: ret float 0.000000e+00 +; + %max = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0.0) + ret float %max +} + define bfloat @maxnum_bfloat() { ; CHECK-LABEL: @maxnum_bfloat( ; CHECK-NEXT: ret bfloat 0xR4228 @@ -155,7 +219,7 @@ define <4 x half> @maxnum_half_vec() { define <4 x float> @maxnum_float_zeros_vec() { ; CHECK-LABEL: @maxnum_float_zeros_vec( -; CHECK-NEXT: ret <4 x float> +; CHECK-NEXT: ret <4 x float> ; %1 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> , <4 x float> ) ret <4 x float> %1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index da6dc34e409684..72d9691b2bb877 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -36,10 +36,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 11 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 ; CHECK-NEXT: LV: Found an 
estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 11 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 @@ -86,10 +86,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 11 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 11 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 ; CHECK-NEXT: 
LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 @@ -112,7 +112,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class -; CHECK-NEXT: LV: Loop cost is 28 +; CHECK-NEXT: LV: Loop cost is 32 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. @@ -122,6 +122,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 ; CHECK: LV: Interleaving disabled by the pass manager ; CHECK-NEXT: LV: Vectorizing: innermost loop. 
+; CHECK-EMPTY: ; entry: %cmp7 = icmp sgt i32 %n, 0 @@ -176,10 +177,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 11 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 11 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 @@ -226,10 +227,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For 
instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 11 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 11 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 @@ -252,7 +253,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class -; CHECK-NEXT: LV: Loop cost is 28 +; CHECK-NEXT: LV: Loop cost is 32 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. 
diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp index baf055e503b7e7..6e4dda8351a1b1 100644 --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -578,6 +578,11 @@ TEST(APFloatTest, MinNum) { EXPECT_EQ(1.0, minnum(f2, f1).convertToDouble()); EXPECT_EQ(1.0, minnum(f1, nan).convertToDouble()); EXPECT_EQ(1.0, minnum(nan, f1).convertToDouble()); + + APFloat zp(0.0); + APFloat zn(-0.0); + EXPECT_EQ(-0.0, minnum(zp, zn).convertToDouble()); + EXPECT_EQ(-0.0, minnum(zn, zp).convertToDouble()); } TEST(APFloatTest, MaxNum) { @@ -589,6 +594,11 @@ TEST(APFloatTest, MaxNum) { EXPECT_EQ(2.0, maxnum(f2, f1).convertToDouble()); EXPECT_EQ(1.0, maxnum(f1, nan).convertToDouble()); EXPECT_EQ(1.0, maxnum(nan, f1).convertToDouble()); + + APFloat zp(0.0); + APFloat zn(-0.0); + EXPECT_EQ(0.0, maxnum(zp, zn).convertToDouble()); + EXPECT_EQ(0.0, maxnum(zn, zp).convertToDouble()); } TEST(APFloatTest, Minimum) { diff --git a/llvm/unittests/Support/RISCVISAInfoTest.cpp b/llvm/unittests/Support/RISCVISAInfoTest.cpp index df4c7f7de8a3d2..82cce23638d5f5 100644 --- a/llvm/unittests/Support/RISCVISAInfoTest.cpp +++ b/llvm/unittests/Support/RISCVISAInfoTest.cpp @@ -752,6 +752,7 @@ R"(All available -march extensions for RISC-V zmmul 1.0 za128rs 1.0 za64rs 1.0 + zacas 1.0 zawrs 1.0 zfa 1.0 zfh 1.0 @@ -873,7 +874,6 @@ Experimental extensions zimop 0.1 zaamo 0.2 zabha 1.0 - zacas 1.0 zalasr 0.1 zalrsc 0.2 zfbfmin 1.0 diff --git a/llvm/unittests/Target/ARM/MachineInstrTest.cpp b/llvm/unittests/Target/ARM/MachineInstrTest.cpp index 3a76054ca4f36d..aeb25bf012d034 100644 --- a/llvm/unittests/Target/ARM/MachineInstrTest.cpp +++ b/llvm/unittests/Target/ARM/MachineInstrTest.cpp @@ -1126,9 +1126,7 @@ TEST(MachineInstr, HasSideEffects) { VLDR_VPR_post, VLDR_VPR_pre, VLLDM, - VLLDM_T2, VLSTM, - VLSTM_T2, VMRS, VMRS_FPCXTNS, VMRS_FPCXTS, diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index 
955dd1e20d2488..bb373afa40ad99 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -24,6 +24,7 @@ include "mlir/IR/EnumAttr.td" include "mlir/IR/SymbolInterfaces.td" include "mlir/Interfaces/ControlFlowInterfaces.td" include "mlir/Interfaces/DataLayoutInterfaces.td" +include "mlir/IR/OpAsmInterface.td" include "mlir/Interfaces/FunctionInterfaces.td" include "mlir/Interfaces/InferIntRangeInterface.td" include "mlir/Interfaces/InferTypeOpInterface.td" @@ -50,9 +51,21 @@ def GPU_DimensionAttr : EnumAttr; class GPU_IndexOp traits = []> : GPU_Op])>, + Pure, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods])>, Arguments<(ins GPU_DimensionAttr:$dimension)>, Results<(outs Index)> { let assemblyFormat = "$dimension attr-dict"; + let extraClassDefinition = [{ + void $cppClass::getAsmResultNames( + llvm::function_ref setNameFn) { + auto dimStr = stringifyDimension(getDimensionAttr().getValue()); + auto opName = getOperationName(); + opName.consume_front("gpu."); + SmallString<8> resultName({opName, "_", dimStr}); + setNameFn(getResult(),resultName); + } + }]; } def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> { diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir index 8d249c9e9b9b8a..511b018877476f 100644 --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -59,24 +59,39 @@ module attributes {gpu.container_module} { gpu.module @kernels { gpu.func @kernel_1(%arg0 : f32, %arg1 : memref) kernel { %tIdX = gpu.thread_id x + // CHECK: thread_id_x %tIdY = gpu.thread_id y + // CHECK-NEXT: thread_id_y %tIdZ = gpu.thread_id z + // CHECK-NEXT: thread_id_z %bDimX = gpu.block_dim x + // CHECK-NEXT: block_dim_x %bDimY = gpu.block_dim y + // CHECK-NEXT: block_dim_y %bDimZ = gpu.block_dim z + // CHECK-NEXT: block_dim_z %bIdX = gpu.block_id x + // CHECK-NEXT: block_id_x %bIdY = gpu.block_id y + // CHECK-NEXT: block_id_y %bIdZ = gpu.block_id z + // CHECK-NEXT: block_id_z %gDimX = 
gpu.grid_dim x + // CHECK-NEXT: grid_dim_x %gDimY = gpu.grid_dim y + // CHECK-NEXT: grid_dim_y %gDimZ = gpu.grid_dim z + // CHECK-NEXT: grid_dim_z %gIdX = gpu.global_id x + // CHECK-NEXT: global_id_x %gIdY = gpu.global_id y + // CHECK-NEXT: global_id_y %gIdZ = gpu.global_id z + // CHECK-NEXT: global_id_z %sgId = gpu.subgroup_id : index %numSg = gpu.num_subgroups : index diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir index 22cf15d4f64047..51a0c8f7c945aa 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir @@ -36,7 +36,7 @@ func.func @matmul_f32() { // Print and verify the output // F32-LABEL: SVE: START OF TEST OUTPUT - vector.print str "SVE: START OF TEST OUTPUT" + vector.print str "SVE: START OF TEST OUTPUT\n" // F32-NEXT: Unranked Memref {{.*}} rank = 2 offset = 0 sizes = [5, 15] strides = [15, 1] data = // F32-COUNT-5: [29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788] @@ -44,7 +44,7 @@ func.func @matmul_f32() { call @printMemrefF32(%xf) : (tensor<*xf32>) -> () // F32-NEXT: SVE: END OF TEST OUTPUT - vector.print str "SVE: END OF TEST OUTPUT" + vector.print str "SVE: END OF TEST OUTPUT\n" return } @@ -73,7 +73,7 @@ func.func @matmul_mixed_ty() { // Print and verify the output // MIXED-LABEL: SVE: START OF TEST OUTPUT - vector.print str "SVE: START OF TEST OUTPUT" + vector.print str "SVE: START OF TEST OUTPUT\n" // MIXED-NEXT: Unranked Memref {{.*}} rank = 2 offset = 0 sizes = [5, 15] strides = [15, 1] data = // MIXED-COUNT-5: [45387, 45387, 45387, 45387, 45387, 45387, 45387, 45387, 45387, 45387, 45387, 45387, 45387, 45387, 45387] @@ -81,7 +81,7 @@ func.func @matmul_mixed_ty() { call @printMemrefI32(%xf) : (tensor<*xi32>) -> () // MIXED-NEXT: SVE: END OF TEST OUTPUT - vector.print str "SVE: END OF TEST OUTPUT" 
+ vector.print str "SVE: END OF TEST OUTPUT\n" return } diff --git a/mlir/test/python/dialects/gpu/dialect.py b/mlir/test/python/dialects/gpu/dialect.py index 0293e8f276be6b..2f49e2e053999b 100644 --- a/mlir/test/python/dialects/gpu/dialect.py +++ b/mlir/test/python/dialects/gpu/dialect.py @@ -27,6 +27,6 @@ def testMMAElementWiseAttr(): module = Module.create() with InsertionPoint(module.body): gpu.BlockDimOp(gpu.Dimension.y) - # CHECK: %0 = gpu.block_dim y + # CHECK: %block_dim_y = gpu.block_dim y print(module) pass diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index a1a5b7fe9bf406..16ceaadf276fa5 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -80,8 +80,8 @@ libc_support_library( ) libc_support_library( - name = "__support_macros_properties_float", - hdrs = ["src/__support/macros/properties/float.h"], + name = "__support_macros_properties_types", + hdrs = ["src/__support/macros/properties/types.h"], deps = [ ":__support_macros_properties_architectures", ":__support_macros_properties_compiler", @@ -332,7 +332,7 @@ libc_support_library( deps = [ ":__support_macros_attributes", ":__support_macros_config", - ":__support_macros_properties_float", + ":__support_macros_properties_types", ":llvm_libc_macros_stdfix_macros", ], ) @@ -697,7 +697,7 @@ libc_support_library( ":__support_cpp_type_traits", ":__support_libc_assert", ":__support_macros_attributes", - ":__support_macros_properties_float", + ":__support_macros_properties_types", ":__support_math_extras", ":__support_uint128", ],