diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 2194d268fa86f0..1079993f496945 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -672,16 +672,6 @@ class alignas(8) Decl { /// Whether this declaration comes from explicit global module. bool isFromExplicitGlobalModule() const; - /// Check if we should skip checking ODRHash for declaration \param D. - /// - /// The existing ODRHash mechanism seems to be not stable enough and - /// the false positive ODR violation reports are annoying and we rarely see - /// true ODR violation reports. Also we learned that MSVC disabled ODR checks - /// for declarations in GMF. So we try to disable ODR checks in the GMF to - /// get better user experiences before we make the ODR violation checks stable - /// enough. - bool shouldSkipCheckingODR() const; - /// Return true if this declaration has an attribute which acts as /// definition of the entity, such as 'alias' or 'ifunc'. bool hasDefiningAttr() const; diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index e3fde887f99cb7..43ee06c524b3a0 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -2457,6 +2457,12 @@ class BitsUnpacker { uint32_t Value; uint32_t CurrentBitsIndex = ~0; }; + +inline bool shouldSkipCheckingODR(const Decl *D) { + return D->getASTContext().getLangOpts().SkipODRCheckInGMF && + D->isFromExplicitGlobalModule(); +} + } // namespace clang #endif // LLVM_CLANG_SERIALIZATION_ASTREADER_H diff --git a/clang/lib/APINotes/APINotesReader.cpp b/clang/lib/APINotes/APINotesReader.cpp index fbbe9c32ce1258..dfc3beb6fa13ee 100644 --- a/clang/lib/APINotes/APINotesReader.cpp +++ b/clang/lib/APINotes/APINotesReader.cpp @@ -30,23 +30,20 @@ namespace { llvm::VersionTuple ReadVersionTuple(const uint8_t *&Data) { uint8_t NumVersions = (*Data++) & 0x03; - unsigned Major = - endian::readNext(Data); + unsigned Major = endian::readNext(Data); if (NumVersions == 0) return llvm::VersionTuple(Major); - unsigned Minor = - endian::readNext(Data); + unsigned Minor = endian::readNext(Data); if (NumVersions == 1) return llvm::VersionTuple(Major, Minor); unsigned Subminor = - endian::readNext(Data); + endian::readNext(Data); if (NumVersions == 2) return llvm::VersionTuple(Major, Minor, Subminor); - unsigned Build = - endian::readNext(Data); + unsigned Build = endian::readNext(Data); return llvm::VersionTuple(Major, Minor, Subminor, Build); } @@ -71,16 +68,16 @@ class VersionedTableInfo { static std::pair ReadKeyDataLength(const uint8_t *&Data) { unsigned KeyLength = - endian::readNext(Data); + endian::readNext(Data); unsigned DataLength = - endian::readNext(Data); + endian::readNext(Data); return {KeyLength, DataLength}; } static data_type ReadData(internal_key_type Key, const uint8_t *Data, unsigned Length) { unsigned NumElements = - endian::readNext(Data); + endian::readNext(Data); data_type Result; Result.reserve(NumElements); for (unsigned i = 0; i != NumElements; ++i) { @@ -105,14 +102,14 @@ void ReadCommonEntityInfo(const uint8_t *&Data, CommonEntityInfo &Info) { Info.setSwiftPrivate(static_cast((UnavailableBits >> 3) & 0x01)); unsigned MsgLength = - endian::readNext(Data); + endian::readNext(Data); Info.UnavailableMsg = std::string(reinterpret_cast(Data), reinterpret_cast(Data) + MsgLength); Data += MsgLength; unsigned SwiftNameLength = - endian::readNext(Data); + endian::readNext(Data); Info.SwiftName = 
std::string(reinterpret_cast(Data), reinterpret_cast(Data) + SwiftNameLength); @@ -124,7 +121,7 @@ void ReadCommonTypeInfo(const uint8_t *&Data, CommonTypeInfo &Info) { ReadCommonEntityInfo(Data, Info); unsigned SwiftBridgeLength = - endian::readNext(Data); + endian::readNext(Data); if (SwiftBridgeLength > 0) { Info.setSwiftBridge(std::string(reinterpret_cast(Data), SwiftBridgeLength - 1)); @@ -132,7 +129,7 @@ void ReadCommonTypeInfo(const uint8_t *&Data, CommonTypeInfo &Info) { } unsigned ErrorDomainLength = - endian::readNext(Data); + endian::readNext(Data); if (ErrorDomainLength > 0) { Info.setNSErrorDomain(std::optional(std::string( reinterpret_cast(Data), ErrorDomainLength - 1))); @@ -163,9 +160,9 @@ class IdentifierTableInfo { static std::pair ReadKeyDataLength(const uint8_t *&Data) { unsigned KeyLength = - endian::readNext(Data); + endian::readNext(Data); unsigned DataLength = - endian::readNext(Data); + endian::readNext(Data); return {KeyLength, DataLength}; } @@ -175,8 +172,7 @@ class IdentifierTableInfo { static data_type ReadData(internal_key_type key, const uint8_t *Data, unsigned Length) { - return endian::readNext( - Data); + return endian::readNext(Data); } }; @@ -203,26 +199,24 @@ class ObjCContextIDTableInfo { static std::pair ReadKeyDataLength(const uint8_t *&Data) { unsigned KeyLength = - endian::readNext(Data); + endian::readNext(Data); unsigned DataLength = - endian::readNext(Data); + endian::readNext(Data); return {KeyLength, DataLength}; } static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { auto ParentCtxID = - endian::readNext(Data); + endian::readNext(Data); auto ContextKind = - endian::readNext(Data); - auto NameID = - endian::readNext(Data); + endian::readNext(Data); + auto NameID = endian::readNext(Data); return {ParentCtxID, ContextKind, NameID}; } static data_type ReadData(internal_key_type Key, const uint8_t *Data, unsigned Length) { - return endian::readNext( - Data); + return endian::readNext(Data); } }; @@ -232,8 +226,7 @@ class ObjCContextInfoTableInfo ObjCContextInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - return endian::readNext( - Data); + return endian::readNext(Data); } hash_value_type ComputeHash(internal_key_type Key) { @@ -273,8 +266,7 @@ void ReadVariableInfo(const uint8_t *&Data, VariableInfo &Info) { } ++Data; - auto TypeLen = - endian::readNext(Data); + auto TypeLen = endian::readNext(Data); Info.setType(std::string(Data, Data + TypeLen)); Data += TypeLen; } @@ -286,12 +278,9 @@ class ObjCPropertyTableInfo ObjCPropertyInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto ClassID = - endian::readNext(Data); - auto NameID = - endian::readNext(Data); - char IsInstance = - endian::readNext(Data); + auto ClassID = endian::readNext(Data); + auto NameID = endian::readNext(Data); + char IsInstance = endian::readNext(Data); return {ClassID, NameID, IsInstance}; } @@ -314,8 +303,7 @@ class ObjCPropertyTableInfo void ReadParamInfo(const uint8_t *&Data, ParamInfo &Info) { ReadVariableInfo(Data, Info); - uint8_t Payload = - endian::readNext(Data); + uint8_t Payload = endian::readNext(Data); if (auto RawConvention = Payload & 0x7) { auto Convention = static_cast(RawConvention - 1); Info.setRetainCountConvention(Convention); @@ -331,8 +319,7 @@ void ReadParamInfo(const uint8_t *&Data, ParamInfo &Info) { void ReadFunctionInfo(const uint8_t *&Data, FunctionInfo &Info) { ReadCommonEntityInfo(Data, Info); - uint8_t Payload = - endian::readNext(Data); + 
uint8_t Payload = endian::readNext(Data); if (auto RawConvention = Payload & 0x7) { auto Convention = static_cast(RawConvention - 1); Info.setRetainCountConvention(Convention); @@ -343,12 +330,12 @@ void ReadFunctionInfo(const uint8_t *&Data, FunctionInfo &Info) { assert(Payload == 0 && "Bad API notes"); Info.NumAdjustedNullable = - endian::readNext(Data); + endian::readNext(Data); Info.NullabilityPayload = - endian::readNext(Data); + endian::readNext(Data); unsigned NumParams = - endian::readNext(Data); + endian::readNext(Data); while (NumParams > 0) { ParamInfo pi; ReadParamInfo(Data, pi); @@ -357,7 +344,7 @@ void ReadFunctionInfo(const uint8_t *&Data, FunctionInfo &Info) { } unsigned ResultTypeLen = - endian::readNext(Data); + endian::readNext(Data); Info.ResultType = std::string(Data, Data + ResultTypeLen); Data += ResultTypeLen; } @@ -369,12 +356,10 @@ class ObjCMethodTableInfo ObjCMethodInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto ClassID = - endian::readNext(Data); + auto ClassID = endian::readNext(Data); auto SelectorID = - endian::readNext(Data); - auto IsInstance = - endian::readNext(Data); + endian::readNext(Data); + auto IsInstance = endian::readNext(Data); return {ClassID, SelectorID, IsInstance}; } @@ -419,29 +404,26 @@ class ObjCSelectorTableInfo { static std::pair ReadKeyDataLength(const uint8_t *&Data) { unsigned KeyLength = - endian::readNext(Data); + endian::readNext(Data); unsigned DataLength = - endian::readNext(Data); + endian::readNext(Data); return {KeyLength, DataLength}; } static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { internal_key_type Key; - Key.NumArgs = - endian::readNext(Data); + Key.NumArgs = endian::readNext(Data); unsigned NumIdents = (Length - sizeof(uint16_t)) / sizeof(uint32_t); for (unsigned i = 0; i != NumIdents; ++i) { Key.Identifiers.push_back( - endian::readNext( - Data)); + endian::readNext(Data)); } return Key; } static data_type ReadData(internal_key_type Key, const uint8_t *Data, unsigned Length) { - return endian::readNext( - Data); + return endian::readNext(Data); } }; @@ -451,12 +433,10 @@ class GlobalVariableTableInfo GlobalVariableInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto CtxID = - endian::readNext(Data); + auto CtxID = endian::readNext(Data); auto ContextKind = - endian::readNext(Data); - auto NameID = - endian::readNext(Data); + endian::readNext(Data); + auto NameID = endian::readNext(Data); return {CtxID, ContextKind, NameID}; } @@ -478,12 +458,10 @@ class GlobalFunctionTableInfo GlobalFunctionInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto CtxID = - endian::readNext(Data); + auto CtxID = endian::readNext(Data); auto ContextKind = - endian::readNext(Data); - auto NameID = - endian::readNext(Data); + endian::readNext(Data); + auto NameID = endian::readNext(Data); return {CtxID, ContextKind, NameID}; } @@ -505,8 +483,7 @@ class EnumConstantTableInfo EnumConstantInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto NameID = - endian::readNext(Data); + auto NameID = endian::readNext(Data); return NameID; } @@ -527,13 +504,11 @@ class TagTableInfo : public VersionedTableInfo { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto CtxID = - endian::readNext(Data); + auto CtxID = endian::readNext(Data); auto ContextKind = - endian::readNext(Data); + endian::readNext(Data); auto NameID = - 
endian::readNext( - Data); + endian::readNext(Data); return {CtxID, ContextKind, NameID}; } @@ -553,21 +528,21 @@ class TagTableInfo static_cast((Payload & 0x3) - 1); unsigned ImportAsLength = - endian::readNext(Data); + endian::readNext(Data); if (ImportAsLength > 0) { Info.SwiftImportAs = std::string(reinterpret_cast(Data), ImportAsLength - 1); Data += ImportAsLength - 1; } unsigned RetainOpLength = - endian::readNext(Data); + endian::readNext(Data); if (RetainOpLength > 0) { Info.SwiftRetainOp = std::string(reinterpret_cast(Data), RetainOpLength - 1); Data += RetainOpLength - 1; } unsigned ReleaseOpLength = - endian::readNext(Data); + endian::readNext(Data); if (ReleaseOpLength > 0) { Info.SwiftReleaseOp = std::string(reinterpret_cast(Data), ReleaseOpLength - 1); @@ -585,13 +560,11 @@ class TypedefTableInfo TypedefInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto CtxID = - endian::readNext(Data); + auto CtxID = endian::readNext(Data); auto ContextKind = - endian::readNext(Data); + endian::readNext(Data); auto nameID = - endian::readNext( - Data); + endian::readNext(Data); return {CtxID, ContextKind, nameID}; } diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 2b2d5a2663a18b..33b6f8611f2162 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -4534,7 +4534,7 @@ unsigned FunctionDecl::getODRHash() { } class ODRHash Hash; - Hash.AddFunctionDecl(this, /*SkipBody=*/shouldSkipCheckingODR()); + Hash.AddFunctionDecl(this); setHasODRHash(true); ODRHash = Hash.CalculateHash(); return ODRHash; diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index 66a727d9dd0c39..434926324c96ca 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -1106,11 +1106,6 @@ bool Decl::isFromExplicitGlobalModule() const { return getOwningModule() && getOwningModule()->isExplicitGlobalModule(); } -bool Decl::shouldSkipCheckingODR() const { - return getASTContext().getLangOpts().SkipODRCheckInGMF && - isFromExplicitGlobalModule(); -} - static Decl::Kind getKind(const Decl *D) { return D->getKind(); } static Decl::Kind getKind(const DeclContext *DC) { return DC->getDeclKind(); } diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 01ec31e4077f70..a069f3ec27e721 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -262,7 +262,7 @@ bool ByteCodeExprGen::VisitCastExpr(const CastExpr *CE) { return this->discard(SubExpr); std::optional FromT = classify(SubExpr->getType()); - std::optional ToT = classifyPrim(CE->getType()); + std::optional ToT = classify(CE->getType()); if (!FromT || !ToT) return false; @@ -1251,6 +1251,15 @@ bool ByteCodeExprGen::VisitUnaryExprOrTypeTraitExpr( return this->emitConst(Size.getQuantity(), E); } + if (Kind == UETT_VectorElements) { + if (const auto *VT = E->getTypeOfArgument()->getAs()) + return this->emitConst(VT->getNumElements(), E); + + // FIXME: Apparently we need to catch the fact that a sizeless vector type + // has been passed and diagnose that (at run time). 
+ assert(E->getTypeOfArgument()->isSizelessVectorType()); + } + return false; } @@ -1258,10 +1267,19 @@ template bool ByteCodeExprGen::VisitMemberExpr(const MemberExpr *E) { // 'Base.Member' const Expr *Base = E->getBase(); + const ValueDecl *Member = E->getMemberDecl(); if (DiscardResult) return this->discard(Base); + if (const auto *VD = dyn_cast(Member)) { + // I am almost confident in saying that a var decl must be static + // and therefore registered as a global variable. But this will probably + // turn out to be wrong some time in the future, as always. + if (auto GlobalIndex = P.getGlobal(VD)) + return this->emitGetPtrGlobal(*GlobalIndex, E); + } + if (Initializing) { if (!this->delegate(Base)) return false; @@ -1271,8 +1289,6 @@ bool ByteCodeExprGen::VisitMemberExpr(const MemberExpr *E) { } // Base above gives us a pointer on the stack. - // TODO: Implement non-FieldDecl members. - const ValueDecl *Member = E->getMemberDecl(); if (const auto *FD = dyn_cast(Member)) { const RecordDecl *RD = FD->getParent(); const Record *R = getRecord(RD); @@ -1615,7 +1631,7 @@ bool ByteCodeExprGen::VisitCompoundAssignOperator( return false; if (!this->emitLoad(*LT, E)) return false; - if (*LT != *LHSComputationT) { + if (LT != LHSComputationT) { if (!this->emitCast(*LT, *LHSComputationT, E)) return false; } @@ -1671,7 +1687,7 @@ bool ByteCodeExprGen::VisitCompoundAssignOperator( } // And now cast from LHSComputationT to ResultT. - if (*ResultT != *LHSComputationT) { + if (ResultT != LHSComputationT) { if (!this->emitCast(*LHSComputationT, *ResultT, E)) return false; } diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp index 022b394e58e643..ebc4e4f195ba62 100644 --- a/clang/lib/AST/Interp/Disasm.cpp +++ b/clang/lib/AST/Interp/Disasm.cpp @@ -264,3 +264,19 @@ LLVM_DUMP_METHOD void Record::dump(llvm::raw_ostream &OS, unsigned Indentation, ++I; } } + +LLVM_DUMP_METHOD void Block::dump(llvm::raw_ostream &OS) const { + { + ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_BLUE, true}); + OS << "Block " << (void *)this << "\n"; + } + unsigned NPointers = 0; + for (const Pointer *P = Pointers; P; P = P->Next) { + ++NPointers; + } + OS << " Pointers: " << NPointers << "\n"; + OS << " Dead: " << IsDead << "\n"; + OS << " Static: " << IsStatic << "\n"; + OS << " Extern: " << IsExtern << "\n"; + OS << " Initialized: " << IsInitialized << "\n"; +} diff --git a/clang/lib/AST/Interp/FunctionPointer.h b/clang/lib/AST/Interp/FunctionPointer.h index c2ea295b82bdf5..fc3d7a4214a72b 100644 --- a/clang/lib/AST/Interp/FunctionPointer.h +++ b/clang/lib/AST/Interp/FunctionPointer.h @@ -32,6 +32,7 @@ class FunctionPointer final { const Function *getFunction() const { return Func; } bool isZero() const { return !Func; } + bool isValid() const { return Valid; } bool isWeak() const { if (!Func || !Valid) return false; diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 4182254357eb9a..dd0bacd73acb10 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -2236,6 +2236,10 @@ inline bool CallPtr(InterpState &S, CodePtr OpPC, uint32_t ArgSize, << const_cast(E) << E->getSourceRange(); return false; } + + if (!FuncPtr.isValid()) + return false; + assert(F); // Check argument nullability state. 
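The many endian::readNext changes in this patch (in the APINotes reader above and in the ASTReader further down) are mechanical: each call reads one fixed-width little-endian field from a byte stream and advances the cursor, and the patch drops the trailing `unaligned` template argument now that readNext defaults its alignment parameter to unaligned. A minimal sketch of the call shape, assuming llvm/Support/Endian.h (the helper name readLengths is illustrative, not taken from the patch):

#include "llvm/Support/Endian.h"
#include <cstdint>
#include <utility>

// Read a 16-bit key length and a 16-bit data length, advancing Data past
// both fields, in the style of the ReadKeyDataLength helpers above.
static std::pair<unsigned, unsigned> readLengths(const uint8_t *&Data) {
  using namespace llvm::support;
  unsigned KeyLength =
      endian::readNext<uint16_t, llvm::endianness::little>(Data);
  unsigned DataLength =
      endian::readNext<uint16_t, llvm::endianness::little>(Data);
  return {KeyLength, DataLength};
}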
diff --git a/clang/lib/AST/Interp/InterpBlock.h b/clang/lib/AST/Interp/InterpBlock.h index 9db82567d2d5d6..6d5856fbd4ea19 100644 --- a/clang/lib/AST/Interp/InterpBlock.h +++ b/clang/lib/AST/Interp/InterpBlock.h @@ -118,6 +118,9 @@ class Block final { IsInitialized = false; } + void dump() const { dump(llvm::errs()); } + void dump(llvm::raw_ostream &OS) const; + protected: friend class Pointer; friend class DeadBlock; diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h index fcd00aac62f93e..b4475577b74625 100644 --- a/clang/lib/AST/Interp/Pointer.h +++ b/clang/lib/AST/Interp/Pointer.h @@ -241,13 +241,10 @@ class Pointer { /// Checks if the pointer is null. bool isZero() const { - if (Offset != 0) - return false; - if (isBlockPointer()) return asBlockPointer().Pointee == nullptr; assert(isIntegralPointer()); - return asIntPointer().Value == 0; + return asIntPointer().Value == 0 && Offset == 0; } /// Checks if the pointer is live. bool isLive() const { diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index d2aac1e640380f..789e4634bd293b 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2071,13 +2071,31 @@ StmtProfiler::VisitLambdaExpr(const LambdaExpr *S) { } CXXRecordDecl *Lambda = S->getLambdaClass(); - ID.AddInteger(Lambda->getODRHash()); - for (const auto &Capture : Lambda->captures()) { ID.AddInteger(Capture.getCaptureKind()); if (Capture.capturesVariable()) VisitDecl(Capture.getCapturedVar()); } + + // Profiling the body of the lambda may be dangerous during deserialization. + // So we'd like only to profile the signature here. + ODRHash Hasher; + // FIXME: We can't get the operator call easily by + // `CXXRecordDecl::getLambdaCallOperator()` if we're in deserialization. + // So we have to do something raw here. + for (auto *SubDecl : Lambda->decls()) { + FunctionDecl *Call = nullptr; + if (auto *FTD = dyn_cast(SubDecl)) + Call = FTD->getTemplatedDecl(); + else if (auto *FD = dyn_cast(SubDecl)) + Call = FD; + + if (!Call) + continue; + + Hasher.AddFunctionDecl(Call, /*SkipBody=*/true); + } + ID.AddInteger(Hasher.CalculateHash()); } void diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp index bea15ce9bd24d1..ee2581143e1141 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp @@ -508,6 +508,11 @@ class ResultObjectVisitor : public RecursiveASTVisitor { isa(E)) { return; } + if (auto *Op = dyn_cast(E); + Op && Op->getOpcode() == BO_Cmp) { + // Builtin `<=>` returns a `std::strong_ordering` object. 
+ return; + } if (auto *InitList = dyn_cast(E)) { if (!InitList->isSemanticForm()) diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index e25991e3dfe821..9a4a8b501460b6 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -259,7 +259,7 @@ class LLVM_LIBRARY_VISIBILITY SPIR32TargetInfo : public SPIRTargetInfo { SizeType = TargetInfo::UnsignedInt; PtrDiffType = IntPtrType = TargetInfo::SignedInt; resetDataLayout("e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"); + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } void getTargetDefines(const LangOptions &Opts, @@ -276,7 +276,7 @@ class LLVM_LIBRARY_VISIBILITY SPIR64TargetInfo : public SPIRTargetInfo { SizeType = TargetInfo::UnsignedLong; PtrDiffType = IntPtrType = TargetInfo::SignedLong; resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"); + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } void getTargetDefines(const LangOptions &Opts, @@ -336,7 +336,7 @@ class LLVM_LIBRARY_VISIBILITY SPIRV32TargetInfo : public BaseSPIRVTargetInfo { SizeType = TargetInfo::UnsignedInt; PtrDiffType = IntPtrType = TargetInfo::SignedInt; resetDataLayout("e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"); + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } void getTargetDefines(const LangOptions &Opts, @@ -357,7 +357,7 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64TargetInfo : public BaseSPIRVTargetInfo { SizeType = TargetInfo::UnsignedLong; PtrDiffType = IntPtrType = TargetInfo::SignedLong; resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"); + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } void getTargetDefines(const LangOptions &Opts, diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 4adcb0f6eca565..01f9b840cdc91b 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -4708,11 +4708,11 @@ void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E, AggValueSlot Slot = args.isUsingInAlloca() ? createPlaceholderSlot(*this, type) : CreateAggTemp(type, "agg.tmp"); - bool DestroyedInCallee = true, NeedsEHCleanup = true; + bool DestroyedInCallee = true, NeedsCleanup = true; if (const auto *RD = type->getAsCXXRecordDecl()) DestroyedInCallee = RD->hasNonTrivialDestructor(); else - NeedsEHCleanup = needsEHCleanup(type.isDestructedType()); + NeedsCleanup = type.isDestructedType(); if (DestroyedInCallee) Slot.setExternallyDestructed(); @@ -4721,14 +4721,15 @@ void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E, RValue RV = Slot.asRValue(); args.add(RV, type); - if (DestroyedInCallee && NeedsEHCleanup) { + if (DestroyedInCallee && NeedsCleanup) { // Create a no-op GEP between the placeholder and the cleanup so we can // RAUW it successfully. It also serves as a marker of the first // instruction where the cleanup is active. - pushFullExprCleanup(EHCleanup, Slot.getAddress(), - type); + pushFullExprCleanup(NormalAndEHCleanup, + Slot.getAddress(), type); // This unreachable is a temporary marker which will be removed later. 
- llvm::Instruction *IsActive = Builder.CreateUnreachable(); + llvm::Instruction *IsActive = + Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy)); args.addArgCleanupDeactivation(EHStack.stable_begin(), IsActive); } return; diff --git a/clang/lib/CodeGen/CGCleanup.cpp b/clang/lib/CodeGen/CGCleanup.cpp index 5bf48bc22a5495..8683f19d9da28e 100644 --- a/clang/lib/CodeGen/CGCleanup.cpp +++ b/clang/lib/CodeGen/CGCleanup.cpp @@ -634,12 +634,19 @@ static void destroyOptimisticNormalEntry(CodeGenFunction &CGF, /// Pops a cleanup block. If the block includes a normal cleanup, the /// current insertion point is threaded through the cleanup, as are /// any branch fixups on the cleanup. -void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) { +void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough, + bool ForDeactivation) { assert(!EHStack.empty() && "cleanup stack is empty!"); assert(isa(*EHStack.begin()) && "top not a cleanup!"); EHCleanupScope &Scope = cast(*EHStack.begin()); assert(Scope.getFixupDepth() <= EHStack.getNumBranchFixups()); + // If we are deactivating a normal cleanup, we need to pretend that the + // fallthrough is unreachable. We restore this IP before returning. + CGBuilderTy::InsertPoint NormalDeactivateOrigIP; + if (ForDeactivation && (Scope.isNormalCleanup() || !getLangOpts().EHAsynch)) { + NormalDeactivateOrigIP = Builder.saveAndClearIP(); + } // Remember activation information. bool IsActive = Scope.isActive(); Address NormalActiveFlag = @@ -729,6 +736,8 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) { EHStack.popCleanup(); // safe because there are no fixups assert(EHStack.getNumBranchFixups() == 0 || EHStack.hasNormalCleanups()); + if (NormalDeactivateOrigIP.isSet()) + Builder.restoreIP(NormalDeactivateOrigIP); return; } @@ -765,9 +774,16 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) { if (!RequiresNormalCleanup) { // Mark CPP scope end for passed-by-value Arg temp // per Windows ABI which is "normally" Cleanup in callee - if (IsEHa && getInvokeDest() && Builder.GetInsertBlock()) { - if (Personality.isMSVCXXPersonality()) + if (IsEHa && getInvokeDest()) { + // If we are deactivating a normal cleanup then we don't have a + // fallthrough. Restore original IP to emit CPP scope ends in the correct + // block. + if (NormalDeactivateOrigIP.isSet()) + Builder.restoreIP(NormalDeactivateOrigIP); + if (Personality.isMSVCXXPersonality() && Builder.GetInsertBlock()) EmitSehCppScopeEnd(); + if (NormalDeactivateOrigIP.isSet()) + NormalDeactivateOrigIP = Builder.saveAndClearIP(); } destroyOptimisticNormalEntry(*this, Scope); Scope.MarkEmitted(); @@ -992,6 +1008,8 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) { } } + if (NormalDeactivateOrigIP.isSet()) + Builder.restoreIP(NormalDeactivateOrigIP); assert(EHStack.hasNormalCleanups() || EHStack.getNumBranchFixups() == 0); // Emit the EH cleanup if required. @@ -1281,17 +1299,8 @@ void CodeGenFunction::DeactivateCleanupBlock(EHScopeStack::stable_iterator C, // to the current RunCleanupsScope. if (C == EHStack.stable_begin() && CurrentCleanupScopeDepth.strictlyEncloses(C)) { - // Per comment below, checking EHAsynch is not really necessary - // it's there to assure zero-impact w/o EHAsynch option - if (!Scope.isNormalCleanup() && getLangOpts().EHAsynch) { - PopCleanupBlock(); - } else { - // If it's a normal cleanup, we need to pretend that the - // fallthrough is unreachable. 
- CGBuilderTy::InsertPoint SavedIP = Builder.saveAndClearIP(); - PopCleanupBlock(); - Builder.restoreIP(SavedIP); - } + PopCleanupBlock(/*FallthroughIsBranchThrough=*/false, + /*ForDeactivation=*/true); return; } diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index f6a944ebd6d3bc..46e9021ac0b29b 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -957,7 +957,8 @@ class CodeGenFunction : public CodeGenTypeCache { /// PopCleanupBlock - Will pop the cleanup entry on the stack and /// process all branch fixups. - void PopCleanupBlock(bool FallThroughIsBranchThrough = false); + void PopCleanupBlock(bool FallThroughIsBranchThrough = false, + bool ForDeactivation = false); /// DeactivateCleanupBlock - Deactivates the given cleanup block. /// The block cannot be reactivated. Pops it if it's the top of the diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 9057a742df9cf1..e8b6fed3909e77 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -3961,9 +3961,20 @@ bool CodeGenModule::shouldEmitFunction(GlobalDecl GD) { // behavior may break ABI compatibility of the current unit. if (const Module *M = F->getOwningModule(); M && M->getTopLevelModule()->isNamedModule() && - getContext().getCurrentNamedModule() != M->getTopLevelModule() && - !F->hasAttr()) - return false; + getContext().getCurrentNamedModule() != M->getTopLevelModule()) { + // There are practices to mark template member function as always-inline + // and mark the template as extern explicit instantiation but not give + // the definition for member function. So we have to emit the function + // from explicitly instantiation with always-inline. + // + // See https://github.com/llvm/llvm-project/issues/86893 for details. + // + // TODO: Maybe it is better to give it a warning if we call a non-inline + // function from other module units which is marked as always-inline. + if (!F->isTemplateInstantiation() || !F->hasAttr()) { + return false; + } + } if (F->hasAttr()) return false; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 9d66388cffb72d..6a8761160332cd 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -347,11 +347,14 @@ static bool addExceptionArgs(const ArgList &Args, types::ID InputType, bool EH = Args.hasFlag(options::OPT_fexceptions, options::OPT_fno_exceptions, false); - bool EHa = Args.hasFlag(options::OPT_fasync_exceptions, - options::OPT_fno_async_exceptions, false); - if (EHa) { - CmdArgs.push_back("-fasync-exceptions"); - EH = true; + // Async exceptions are Windows MSVC only. + if (Triple.isWindowsMSVCEnvironment()) { + bool EHa = Args.hasFlag(options::OPT_fasync_exceptions, + options::OPT_fno_async_exceptions, false); + if (EHa) { + CmdArgs.push_back("-fasync-exceptions"); + EH = true; + } } // Obj-C exceptions are enabled by default, regardless of -fexceptions. This @@ -8260,7 +8263,8 @@ struct EHFlags { /// The 'a' modifier is unimplemented and fundamentally hard in LLVM IR. /// - c: Assume that extern "C" functions are implicitly nounwind. /// The default is /EHs-c-, meaning cleanups are disabled. 
-static EHFlags parseClangCLEHFlags(const Driver &D, const ArgList &Args) { +static EHFlags parseClangCLEHFlags(const Driver &D, const ArgList &Args, + bool isWindowsMSVC) { EHFlags EH; std::vector EHArgs = @@ -8270,8 +8274,15 @@ static EHFlags parseClangCLEHFlags(const Driver &D, const ArgList &Args) { switch (EHVal[I]) { case 'a': EH.Asynch = maybeConsumeDash(EHVal, I); - if (EH.Asynch) + if (EH.Asynch) { + // Async exceptions are Windows MSVC only. + if (!isWindowsMSVC) { + EH.Asynch = false; + D.Diag(clang::diag::warn_drv_unused_argument) << "/EHa" << EHVal; + continue; + } EH.Synch = false; + } continue; case 'c': EH.NoUnwindC = maybeConsumeDash(EHVal, I); @@ -8335,7 +8346,8 @@ void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType, const Driver &D = getToolChain().getDriver(); - EHFlags EH = parseClangCLEHFlags(D, Args); + bool IsWindowsMSVC = getToolChain().getTriple().isWindowsMSVCEnvironment(); + EHFlags EH = parseClangCLEHFlags(D, Args, IsWindowsMSVC); if (!isNVPTX && (EH.Synch || EH.Asynch)) { if (types::isCXX(InputType)) CmdArgs.push_back("-fcxx-exceptions"); diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 8c4b460970ad2b..f47d540ea4b86d 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -915,10 +915,9 @@ ASTSelectorLookupTrait::ReadKey(const unsigned char* d, unsigned) { using namespace llvm::support; SelectorTable &SelTable = Reader.getContext().Selectors; - unsigned N = - endian::readNext(d); + unsigned N = endian::readNext(d); const IdentifierInfo *FirstII = Reader.getLocalIdentifier( - F, endian::readNext(d)); + F, endian::readNext(d)); if (N == 0) return SelTable.getNullarySelector(FirstII); else if (N == 1) @@ -928,7 +927,7 @@ ASTSelectorLookupTrait::ReadKey(const unsigned char* d, unsigned) { Args.push_back(FirstII); for (unsigned I = 1; I != N; ++I) Args.push_back(Reader.getLocalIdentifier( - F, endian::readNext(d))); + F, endian::readNext(d))); return SelTable.getSelector(N, Args.data()); } @@ -941,11 +940,11 @@ ASTSelectorLookupTrait::ReadData(Selector, const unsigned char* d, data_type Result; Result.ID = Reader.getGlobalSelectorID( - F, endian::readNext(d)); + F, endian::readNext(d)); unsigned FullInstanceBits = - endian::readNext(d); + endian::readNext(d); unsigned FullFactoryBits = - endian::readNext(d); + endian::readNext(d); Result.InstanceBits = FullInstanceBits & 0x3; Result.InstanceHasMoreThanOneDecl = (FullInstanceBits >> 2) & 0x1; Result.FactoryBits = FullFactoryBits & 0x3; @@ -956,16 +955,14 @@ ASTSelectorLookupTrait::ReadData(Selector, const unsigned char* d, // Load instance methods for (unsigned I = 0; I != NumInstanceMethods; ++I) { if (ObjCMethodDecl *Method = Reader.GetLocalDeclAs( - F, - endian::readNext(d))) + F, endian::readNext(d))) Result.Instance.push_back(Method); } // Load factory methods for (unsigned I = 0; I != NumFactoryMethods; ++I) { if (ObjCMethodDecl *Method = Reader.GetLocalDeclAs( - F, - endian::readNext(d))) + F, endian::readNext(d))) Result.Factory.push_back(Method); } @@ -1009,8 +1006,7 @@ static bool readBit(unsigned &Bits) { IdentID ASTIdentifierLookupTrait::ReadIdentifierID(const unsigned char *d) { using namespace llvm::support; - unsigned RawID = - endian::readNext(d); + unsigned RawID = endian::readNext(d); return Reader.getGlobalIdentifierID(F, RawID >> 1); } @@ -1028,8 +1024,7 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, unsigned DataLen) { using namespace llvm::support; - 
unsigned RawID = - endian::readNext(d); + unsigned RawID = endian::readNext(d); bool IsInteresting = RawID & 0x01; // Wipe out the "is interesting" bit. @@ -1053,9 +1048,8 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, } unsigned ObjCOrBuiltinID = - endian::readNext(d); - unsigned Bits = - endian::readNext(d); + endian::readNext(d); + unsigned Bits = endian::readNext(d); bool CPlusPlusOperatorKeyword = readBit(Bits); bool HasRevertedTokenIDToIdentifier = readBit(Bits); bool Poisoned = readBit(Bits); @@ -1084,7 +1078,7 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, // definition. if (HadMacroDefinition) { uint32_t MacroDirectivesOffset = - endian::readNext(d); + endian::readNext(d); DataLen -= 4; Reader.addPendingMacro(II, &F, MacroDirectivesOffset); @@ -1098,8 +1092,7 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, SmallVector DeclIDs; for (; DataLen > 0; DataLen -= 4) DeclIDs.push_back(Reader.getGlobalDeclID( - F, - endian::readNext(d))); + F, endian::readNext(d))); Reader.SetGloballyVisibleDecls(II, DeclIDs); } @@ -1169,7 +1162,7 @@ ASTDeclContextNameLookupTrait::ReadFileRef(const unsigned char *&d) { using namespace llvm::support; uint32_t ModuleFileID = - endian::readNext(d); + endian::readNext(d); return Reader.getLocalModuleFile(F, ModuleFileID); } @@ -1189,18 +1182,15 @@ ASTDeclContextNameLookupTrait::ReadKey(const unsigned char *d, unsigned) { case DeclarationName::CXXLiteralOperatorName: case DeclarationName::CXXDeductionGuideName: Data = (uint64_t)Reader.getLocalIdentifier( - F, endian::readNext(d)); + F, endian::readNext(d)); break; case DeclarationName::ObjCZeroArgSelector: case DeclarationName::ObjCOneArgSelector: case DeclarationName::ObjCMultiArgSelector: - Data = - (uint64_t)Reader - .getLocalSelector( - F, - endian::readNext( - d)) - .getAsOpaquePtr(); + Data = (uint64_t)Reader + .getLocalSelector( + F, endian::readNext(d)) + .getAsOpaquePtr(); break; case DeclarationName::CXXOperatorName: Data = *d++; // OverloadedOperatorKind @@ -1223,8 +1213,7 @@ void ASTDeclContextNameLookupTrait::ReadDataInto(internal_key_type, using namespace llvm::support; for (unsigned NumDecls = DataLen / 4; NumDecls; --NumDecls) { - uint32_t LocalID = - endian::readNext(d); + uint32_t LocalID = endian::readNext(d); Val.insert(Reader.getGlobalDeclID(F, LocalID)); } } @@ -2033,10 +2022,9 @@ HeaderFileInfoTrait::ReadKey(const unsigned char *d, unsigned) { using namespace llvm::support; internal_key_type ikey; - ikey.Size = - off_t(endian::readNext(d)); - ikey.ModTime = time_t( - endian::readNext(d)); + ikey.Size = off_t(endian::readNext(d)); + ikey.ModTime = + time_t(endian::readNext(d)); ikey.Filename = (const char *)d; ikey.Imported = true; return ikey; @@ -2064,9 +2052,9 @@ HeaderFileInfoTrait::ReadData(internal_key_ref key, const unsigned char *d, HFI.DirInfo = (Flags >> 1) & 0x07; HFI.IndexHeaderMapHeader = Flags & 0x01; HFI.ControllingMacroID = Reader.getGlobalIdentifierID( - M, endian::readNext(d)); + M, endian::readNext(d)); if (unsigned FrameworkOffset = - endian::readNext(d)) { + endian::readNext(d)) { // The framework offset is 1 greater than the actual offset, // since 0 is used as an indicator for "no framework name". 
StringRef FrameworkName(FrameworkStrings + FrameworkOffset - 1); @@ -2077,7 +2065,7 @@ HeaderFileInfoTrait::ReadData(internal_key_ref key, const unsigned char *d, "Wrong data length in HeaderFileInfo deserialization"); while (d != End) { uint32_t LocalSMID = - endian::readNext(d); + endian::readNext(d); auto HeaderRole = static_cast(LocalSMID & 7); LocalSMID >>= 3; @@ -4085,9 +4073,8 @@ void ASTReader::ReadModuleOffsetMap(ModuleFile &F) const { // how it goes... using namespace llvm::support; ModuleKind Kind = static_cast( - endian::readNext(Data)); - uint16_t Len = - endian::readNext(Data); + endian::readNext(Data)); + uint16_t Len = endian::readNext(Data); StringRef Name = StringRef((const char*)Data, Len); Data += Len; ModuleFile *OM = (Kind == MK_PrebuiltModule || Kind == MK_ExplicitModule || @@ -4103,21 +4090,21 @@ void ASTReader::ReadModuleOffsetMap(ModuleFile &F) const { } SourceLocation::UIntTy SLocOffset = - endian::readNext(Data); + endian::readNext(Data); uint32_t IdentifierIDOffset = - endian::readNext(Data); + endian::readNext(Data); uint32_t MacroIDOffset = - endian::readNext(Data); + endian::readNext(Data); uint32_t PreprocessedEntityIDOffset = - endian::readNext(Data); + endian::readNext(Data); uint32_t SubmoduleIDOffset = - endian::readNext(Data); + endian::readNext(Data); uint32_t SelectorIDOffset = - endian::readNext(Data); + endian::readNext(Data); uint32_t DeclIDOffset = - endian::readNext(Data); + endian::readNext(Data); uint32_t TypeIndexOffset = - endian::readNext(Data); + endian::readNext(Data); auto mapOffset = [&](uint32_t Offset, uint32_t BaseOffset, RemapBuilder &Remap) { @@ -9798,7 +9785,7 @@ void ASTReader::finishPendingActions() { !NonConstDefn->isLateTemplateParsed() && // We only perform ODR checks for decls not in the explicit // global module fragment. - !FD->shouldSkipCheckingODR() && + !shouldSkipCheckingODR(FD) && FD->getODRHash() != NonConstDefn->getODRHash()) { if (!isa(FD)) { PendingFunctionOdrMergeFailures[FD].push_back(NonConstDefn); diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index e4b6a75c118ba3..74d40f7da34cad 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -826,7 +826,7 @@ void ASTDeclReader::VisitEnumDecl(EnumDecl *ED) { Reader.mergeDefinitionVisibility(OldDef, ED); // We don't want to check the ODR hash value for declarations from global // module fragment. - if (!ED->shouldSkipCheckingODR() && + if (!shouldSkipCheckingODR(ED) && OldDef->getODRHash() != ED->getODRHash()) Reader.PendingEnumOdrMergeFailures[OldDef].push_back(ED); } else { @@ -868,7 +868,7 @@ void ASTDeclReader::VisitRecordDecl(RecordDecl *RD) { VisitRecordDeclImpl(RD); // We should only reach here if we're in C/Objective-C. There is no // global module fragment. - assert(!RD->shouldSkipCheckingODR()); + assert(!shouldSkipCheckingODR(RD)); RD->setODRHash(Record.readInt()); // Maintain the invariant of a redeclaration chain containing only @@ -2155,7 +2155,7 @@ void ASTDeclReader::MergeDefinitionData( } // We don't want to check ODR for decls in the global module fragment. - if (MergeDD.Definition->shouldSkipCheckingODR()) + if (shouldSkipCheckingODR(MergeDD.Definition)) return; if (D->getODRHash() != MergeDD.ODRHash) { @@ -3530,7 +3530,7 @@ ASTDeclReader::FindExistingResult ASTDeclReader::findExisting(NamedDecl *D) { // same template specialization into the same CXXRecordDecl. 
auto MergedDCIt = Reader.MergedDeclContexts.find(D->getLexicalDeclContext()); if (MergedDCIt != Reader.MergedDeclContexts.end() && - !D->shouldSkipCheckingODR() && MergedDCIt->second == D->getDeclContext()) + !shouldSkipCheckingODR(D) && MergedDCIt->second == D->getDeclContext()) Reader.PendingOdrMergeChecks.push_back(D); return FindExistingResult(Reader, D, /*Existing=*/nullptr, diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 85b7fd5535a1bf..ce6fa1feb1eeb3 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6188,7 +6188,7 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) { BitsPacker DefinitionBits; - bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR(); + bool ShouldSkipCheckingODR = shouldSkipCheckingODR(D); DefinitionBits.addBit(ShouldSkipCheckingODR); #define FIELD(Name, Width, Merge) \ diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 276b6257f1d841..d0d49bcdf991a9 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -526,7 +526,7 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { BitsPacker EnumDeclBits; EnumDeclBits.addBits(D->getNumPositiveBits(), /*BitWidth=*/8); EnumDeclBits.addBits(D->getNumNegativeBits(), /*BitWidth=*/8); - bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR(); + bool ShouldSkipCheckingODR = shouldSkipCheckingODR(D); EnumDeclBits.addBit(ShouldSkipCheckingODR); EnumDeclBits.addBit(D->isScoped()); EnumDeclBits.addBit(D->isScopedUsingClassTag()); @@ -552,7 +552,7 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { !D->isTopLevelDeclInObjCContainer() && !CXXRecordDecl::classofKind(D->getKind()) && !D->getIntegerTypeSourceInfo() && !D->getMemberSpecializationInfo() && - !needsAnonymousDeclarationNumber(D) && !D->shouldSkipCheckingODR() && + !needsAnonymousDeclarationNumber(D) && !shouldSkipCheckingODR(D) && D->getDeclName().getNameKind() == DeclarationName::Identifier) AbbrevToUse = Writer.getDeclEnumAbbrev(); @@ -718,7 +718,7 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { // FIXME: stable encoding FunctionDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()), 3); FunctionDeclBits.addBits((uint32_t)D->getStorageClass(), /*BitWidth=*/3); - bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR(); + bool ShouldSkipCheckingODR = shouldSkipCheckingODR(D); FunctionDeclBits.addBit(ShouldSkipCheckingODR); FunctionDeclBits.addBit(D->isInlineSpecified()); FunctionDeclBits.addBit(D->isInlined()); @@ -1559,7 +1559,7 @@ void ASTDeclWriter::VisitCXXMethodDecl(CXXMethodDecl *D) { D->getFirstDecl() == D->getMostRecentDecl() && !D->isInvalidDecl() && !D->hasAttrs() && !D->isTopLevelDeclInObjCContainer() && D->getDeclName().getNameKind() == DeclarationName::Identifier && - !D->shouldSkipCheckingODR() && !D->hasExtInfo() && + !shouldSkipCheckingODR(D) && !D->hasExtInfo() && !D->isExplicitlyDefaulted()) { if (D->getTemplatedKind() == FunctionDecl::TK_NonTemplate || D->getTemplatedKind() == FunctionDecl::TK_FunctionTemplate || diff --git a/clang/lib/Serialization/GlobalModuleIndex.cpp b/clang/lib/Serialization/GlobalModuleIndex.cpp index dd4fc3e009050f..8ff10f6a8621e8 100644 --- a/clang/lib/Serialization/GlobalModuleIndex.cpp +++ b/clang/lib/Serialization/GlobalModuleIndex.cpp @@ -89,10 +89,8 @@ class IdentifierIndexReaderTrait { static std::pair ReadKeyDataLength(const unsigned char*& d) { using namespace llvm::support; - unsigned KeyLen 
= - endian::readNext(d); - unsigned DataLen = - endian::readNext(d); + unsigned KeyLen = endian::readNext(d); + unsigned DataLen = endian::readNext(d); return std::make_pair(KeyLen, DataLen); } @@ -113,8 +111,7 @@ class IdentifierIndexReaderTrait { data_type Result; while (DataLen > 0) { - unsigned ID = - endian::readNext(d); + unsigned ID = endian::readNext(d); Result.push_back(ID); DataLen -= 4; } @@ -514,8 +511,7 @@ namespace { // The first bit indicates whether this identifier is interesting. // That's all we care about. using namespace llvm::support; - unsigned RawID = - endian::readNext(d); + unsigned RawID = endian::readNext(d); bool IsInteresting = RawID & 0x01; return std::make_pair(k, IsInteresting); } diff --git a/clang/lib/Serialization/MultiOnDiskHashTable.h b/clang/lib/Serialization/MultiOnDiskHashTable.h index 2402a628b512fb..a0d75ec3a9e76e 100644 --- a/clang/lib/Serialization/MultiOnDiskHashTable.h +++ b/clang/lib/Serialization/MultiOnDiskHashTable.h @@ -200,11 +200,11 @@ template class MultiOnDiskHashTable { storage_type Ptr = Data; uint32_t BucketOffset = - endian::readNext(Ptr); + endian::readNext(Ptr); // Read the list of overridden files. uint32_t NumFiles = - endian::readNext(Ptr); + endian::readNext(Ptr); // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make // an additional copy. llvm::SmallVector OverriddenFiles; diff --git a/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp index 01e46fa8591c07..1a75d7b52ad6e9 100644 --- a/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp @@ -6,7 +6,11 @@ // //===----------------------------------------------------------------------===// // -// This checker evaluates clang builtin functions. +// This checker evaluates "standalone" clang builtin functions that are not +// just special-cased variants of well-known non-builtin functions. +// Builtin functions like __builtin_memcpy and __builtin_alloca should be +// evaluated by the same checker that handles their non-builtin variant to +// ensure that the two variants are handled consistently. // //===----------------------------------------------------------------------===// @@ -80,25 +84,6 @@ bool BuiltinFunctionChecker::evalCall(const CallEvent &Call, return true; } - case Builtin::BI__builtin_alloca_with_align: - case Builtin::BI__builtin_alloca: { - SValBuilder &SVB = C.getSValBuilder(); - const loc::MemRegionVal R = - SVB.getAllocaRegionVal(CE, C.getLocationContext(), C.blockCount()); - - // Set the extent of the region in bytes. This enables us to use the SVal - // of the argument directly. If we saved the extent in bits, it'd be more - // difficult to reason about values like symbol*8. - auto Size = Call.getArgSVal(0); - if (auto DefSize = Size.getAs()) { - // This `getAs()` is mostly paranoia, because core.CallAndMessage reports - // undefined function arguments (unless it's disabled somehow). 
- state = setDynamicExtent(state, R.getRegion(), *DefSize, SVB); - } - C.addTransition(state->BindExpr(CE, LCtx, R)); - return true; - } - case Builtin::BI__builtin_dynamic_object_size: case Builtin::BI__builtin_object_size: case Builtin::BI__builtin_constant_p: { diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index 88fb42b6625aa4..11651fd491f743 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -401,10 +401,11 @@ class MallocChecker }; const CallDescriptionMap FreeingMemFnMap{ - {{{"free"}, 1}, &MallocChecker::checkFree}, - {{{"if_freenameindex"}, 1}, &MallocChecker::checkIfFreeNameIndex}, - {{{"kfree"}, 1}, &MallocChecker::checkFree}, - {{{"g_free"}, 1}, &MallocChecker::checkFree}, + {{CDM::CLibrary, {"free"}, 1}, &MallocChecker::checkFree}, + {{CDM::CLibrary, {"if_freenameindex"}, 1}, + &MallocChecker::checkIfFreeNameIndex}, + {{CDM::CLibrary, {"kfree"}, 1}, &MallocChecker::checkFree}, + {{CDM::CLibrary, {"g_free"}, 1}, &MallocChecker::checkFree}, }; bool isFreeingCall(const CallEvent &Call) const; @@ -413,41 +414,46 @@ class MallocChecker friend class NoOwnershipChangeVisitor; CallDescriptionMap AllocatingMemFnMap{ - {{{"alloca"}, 1}, &MallocChecker::checkAlloca}, - {{{"_alloca"}, 1}, &MallocChecker::checkAlloca}, - {{{"malloc"}, 1}, &MallocChecker::checkBasicAlloc}, - {{{"malloc"}, 3}, &MallocChecker::checkKernelMalloc}, - {{{"calloc"}, 2}, &MallocChecker::checkCalloc}, - {{{"valloc"}, 1}, &MallocChecker::checkBasicAlloc}, + {{CDM::CLibrary, {"alloca"}, 1}, &MallocChecker::checkAlloca}, + {{CDM::CLibrary, {"_alloca"}, 1}, &MallocChecker::checkAlloca}, + // The line for "alloca" also covers "__builtin_alloca", but the + // _with_align variant must be listed separately because it takes an + // extra argument: + {{CDM::CLibrary, {"__builtin_alloca_with_align"}, 2}, + &MallocChecker::checkAlloca}, + {{CDM::CLibrary, {"malloc"}, 1}, &MallocChecker::checkBasicAlloc}, + {{CDM::CLibrary, {"malloc"}, 3}, &MallocChecker::checkKernelMalloc}, + {{CDM::CLibrary, {"calloc"}, 2}, &MallocChecker::checkCalloc}, + {{CDM::CLibrary, {"valloc"}, 1}, &MallocChecker::checkBasicAlloc}, {{CDM::CLibrary, {"strndup"}, 2}, &MallocChecker::checkStrdup}, {{CDM::CLibrary, {"strdup"}, 1}, &MallocChecker::checkStrdup}, - {{{"_strdup"}, 1}, &MallocChecker::checkStrdup}, - {{{"kmalloc"}, 2}, &MallocChecker::checkKernelMalloc}, - {{{"if_nameindex"}, 1}, &MallocChecker::checkIfNameIndex}, + {{CDM::CLibrary, {"_strdup"}, 1}, &MallocChecker::checkStrdup}, + {{CDM::CLibrary, {"kmalloc"}, 2}, &MallocChecker::checkKernelMalloc}, + {{CDM::CLibrary, {"if_nameindex"}, 1}, &MallocChecker::checkIfNameIndex}, {{CDM::CLibrary, {"wcsdup"}, 1}, &MallocChecker::checkStrdup}, {{CDM::CLibrary, {"_wcsdup"}, 1}, &MallocChecker::checkStrdup}, - {{{"g_malloc"}, 1}, &MallocChecker::checkBasicAlloc}, - {{{"g_malloc0"}, 1}, &MallocChecker::checkGMalloc0}, - {{{"g_try_malloc"}, 1}, &MallocChecker::checkBasicAlloc}, - {{{"g_try_malloc0"}, 1}, &MallocChecker::checkGMalloc0}, - {{{"g_memdup"}, 2}, &MallocChecker::checkGMemdup}, - {{{"g_malloc_n"}, 2}, &MallocChecker::checkGMallocN}, - {{{"g_malloc0_n"}, 2}, &MallocChecker::checkGMallocN0}, - {{{"g_try_malloc_n"}, 2}, &MallocChecker::checkGMallocN}, - {{{"g_try_malloc0_n"}, 2}, &MallocChecker::checkGMallocN0}, + {{CDM::CLibrary, {"g_malloc"}, 1}, &MallocChecker::checkBasicAlloc}, + {{CDM::CLibrary, {"g_malloc0"}, 1}, &MallocChecker::checkGMalloc0}, + 
{{CDM::CLibrary, {"g_try_malloc"}, 1}, &MallocChecker::checkBasicAlloc}, + {{CDM::CLibrary, {"g_try_malloc0"}, 1}, &MallocChecker::checkGMalloc0}, + {{CDM::CLibrary, {"g_memdup"}, 2}, &MallocChecker::checkGMemdup}, + {{CDM::CLibrary, {"g_malloc_n"}, 2}, &MallocChecker::checkGMallocN}, + {{CDM::CLibrary, {"g_malloc0_n"}, 2}, &MallocChecker::checkGMallocN0}, + {{CDM::CLibrary, {"g_try_malloc_n"}, 2}, &MallocChecker::checkGMallocN}, + {{CDM::CLibrary, {"g_try_malloc0_n"}, 2}, &MallocChecker::checkGMallocN0}, }; CallDescriptionMap ReallocatingMemFnMap{ - {{{"realloc"}, 2}, + {{CDM::CLibrary, {"realloc"}, 2}, std::bind(&MallocChecker::checkRealloc, _1, _2, _3, false)}, - {{{"reallocf"}, 2}, + {{CDM::CLibrary, {"reallocf"}, 2}, std::bind(&MallocChecker::checkRealloc, _1, _2, _3, true)}, - {{{"g_realloc"}, 2}, + {{CDM::CLibrary, {"g_realloc"}, 2}, std::bind(&MallocChecker::checkRealloc, _1, _2, _3, false)}, - {{{"g_try_realloc"}, 2}, + {{CDM::CLibrary, {"g_try_realloc"}, 2}, std::bind(&MallocChecker::checkRealloc, _1, _2, _3, false)}, - {{{"g_realloc_n"}, 3}, &MallocChecker::checkReallocN}, - {{{"g_try_realloc_n"}, 3}, &MallocChecker::checkReallocN}, + {{CDM::CLibrary, {"g_realloc_n"}, 3}, &MallocChecker::checkReallocN}, + {{CDM::CLibrary, {"g_try_realloc_n"}, 3}, &MallocChecker::checkReallocN}, // NOTE: the following CallDescription also matches the C++ standard // library function std::getline(); the callback will filter it out. @@ -1259,9 +1265,6 @@ static bool isStandardRealloc(const CallEvent &Call) { assert(FD); ASTContext &AC = FD->getASTContext(); - if (isa(FD)) - return false; - return FD->getDeclaredReturnType().getDesugaredType(AC) == AC.VoidPtrTy && FD->getParamDecl(0)->getType().getDesugaredType(AC) == AC.VoidPtrTy && FD->getParamDecl(1)->getType().getDesugaredType(AC) == @@ -1273,9 +1276,6 @@ static bool isGRealloc(const CallEvent &Call) { assert(FD); ASTContext &AC = FD->getASTContext(); - if (isa(FD)) - return false; - return FD->getDeclaredReturnType().getDesugaredType(AC) == AC.VoidPtrTy && FD->getParamDecl(0)->getType().getDesugaredType(AC) == AC.VoidPtrTy && FD->getParamDecl(1)->getType().getDesugaredType(AC) == @@ -1284,14 +1284,14 @@ static bool isGRealloc(const CallEvent &Call) { void MallocChecker::checkRealloc(const CallEvent &Call, CheckerContext &C, bool ShouldFreeOnFail) const { - // HACK: CallDescription currently recognizes non-standard realloc functions - // as standard because it doesn't check the type, or wether its a non-method - // function. This should be solved by making CallDescription smarter. - // Mind that this came from a bug report, and all other functions suffer from - // this. - // https://bugs.llvm.org/show_bug.cgi?id=46253 + // Ignore calls to functions whose type does not match the expected type of + // either the standard realloc or g_realloc from GLib. + // FIXME: Should we perform this kind of checking consistently for each + // function? If yes, then perhaps extend the `CallDescription` interface to + // handle this. if (!isStandardRealloc(Call) && !isGRealloc(Call)) return; + ProgramStateRef State = C.getState(); State = ReallocMemAux(C, Call, ShouldFreeOnFail, State, AF_Malloc); State = ProcessZeroAllocCheck(Call, 1, State); @@ -1842,9 +1842,18 @@ static ProgramStateRef MallocUpdateRefState(CheckerContext &C, const Expr *E, return nullptr; SymbolRef Sym = RetVal->getAsLocSymbol(); + // This is a return value of a function that was not inlined, such as malloc() // or new(). We've checked that in the caller. Therefore, it must be a symbol. 
assert(Sym); + // FIXME: In theory this assertion should fail for `alloca()` calls (because + // `AllocaRegion`s are not symbolic); but in practice this does not happen. + // As the current code appears to work correctly, I'm not touching this issue + // now, but it would be good to investigate and clarify this. + // Also note that perhaps the special `AllocaRegion` should be replaced by + // `SymbolicRegion` (or turned into a subclass of `SymbolicRegion`) to enable + // proper tracking of memory allocated by `alloca()` -- and after that change + // this assertion would become valid again. // Set the symbol's state to Allocated. return State->set(Sym, RefState::getAllocated(Family, E)); diff --git a/clang/test/AST/Interp/functions.cpp b/clang/test/AST/Interp/functions.cpp index 4fb3c816000ab8..f9bb5d53634e0b 100644 --- a/clang/test/AST/Interp/functions.cpp +++ b/clang/test/AST/Interp/functions.cpp @@ -584,9 +584,20 @@ namespace VariadicOperator { namespace WeakCompare { [[gnu::weak]]void weak_method(); static_assert(weak_method != nullptr, ""); // both-error {{not an integral constant expression}} \ - // both-note {{comparison against address of weak declaration '&weak_method' can only be performed at runtim}} + // both-note {{comparison against address of weak declaration '&weak_method' can only be performed at runtim}} constexpr auto A = &weak_method; static_assert(A != nullptr, ""); // both-error {{not an integral constant expression}} \ - // both-note {{comparison against address of weak declaration '&weak_method' can only be performed at runtim}} + // both-note {{comparison against address of weak declaration '&weak_method' can only be performed at runtim}} +} + +namespace FromIntegral { +#if __cplusplus >= 202002L + typedef double (*DoubleFn)(); + int a[(int)DoubleFn((void*)-1)()]; // both-error {{not allowed at file scope}} \ + // both-warning {{variable length arrays}} + int b[(int)DoubleFn((void*)(-1 + 1))()]; // both-error {{not allowed at file scope}} \ + // expected-note {{evaluates to a null function pointer}} \ + // both-warning {{variable length arrays}} +#endif } diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp index f251497ed70182..2c33fa1bf88432 100644 --- a/clang/test/AST/Interp/records.cpp +++ b/clang/test/AST/Interp/records.cpp @@ -1309,3 +1309,11 @@ namespace pr18633 { func2(); } } + +namespace { + struct F { + static constexpr int Z = 12; + }; + F f; + static_assert(f.Z == 12, ""); +} diff --git a/clang/test/AST/Interp/vectors.cpp b/clang/test/AST/Interp/vectors.cpp index 8afef3c897bff7..6c5d916f51f563 100644 --- a/clang/test/AST/Interp/vectors.cpp +++ b/clang/test/AST/Interp/vectors.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s // RUN: %clang_cc1 -verify=ref,both %s -// both-no-diagnostics +// ref-no-diagnostics typedef int __attribute__((vector_size(16))) VI4; constexpr VI4 A = {1,2,3,4}; @@ -13,10 +13,18 @@ namespace Vector { return VI4 { n * 3, n + 4, n - 5, n / 6 }; } constexpr auto v1 = f(10); + static_assert(__builtin_vectorelements(v1) == (16 / sizeof(int)), ""); typedef double __attribute__((vector_size(32))) VD4; constexpr VD4 g(int n) { return (VD4) { n / 2.0, n + 1.5, n - 5.4, n * 0.9 }; } constexpr auto v2 = g(4); + static_assert(__builtin_vectorelements(v2) == (32 / sizeof(double)), ""); +} + +/// FIXME: We need to support BitCasts between vector types. 
+namespace { + typedef float __attribute__((vector_size(16))) VI42; + constexpr VI42 A2 = A; // expected-error {{must be initialized by a constant expression}} } diff --git a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h index 85db68d41a6c80..1c2be322f83c20 100644 --- a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h +++ b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h @@ -1106,6 +1106,7 @@ using ostream = basic_ostream; extern std::ostream cout; ostream &operator<<(ostream &, const string &); + #if __cplusplus >= 202002L template ostream &operator<<(ostream &, const std::unique_ptr &); @@ -1122,11 +1123,12 @@ istream &getline(istream &, string &, char); istream &getline(istream &, string &); } // namespace std -#ifdef TEST_INLINABLE_ALLOCATORS namespace std { void *malloc(size_t); void free(void *); -} +} // namespace std + +#ifdef TEST_INLINABLE_ALLOCATORS void* operator new(std::size_t size, const std::nothrow_t&) throw() { return std::malloc(size); } void* operator new[](std::size_t size, const std::nothrow_t&) throw() { return std::malloc(size); } void operator delete(void* ptr, const std::nothrow_t&) throw() { std::free(ptr); } diff --git a/clang/test/Analysis/cxx-uninitialized-object-ptr-ref.cpp b/clang/test/Analysis/cxx-uninitialized-object-ptr-ref.cpp index fc067dd04428a8..f46a2c9bc368f6 100644 --- a/clang/test/Analysis/cxx-uninitialized-object-ptr-ref.cpp +++ b/clang/test/Analysis/cxx-uninitialized-object-ptr-ref.cpp @@ -1,9 +1,9 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,optin.cplusplus.UninitializedObject \ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,optin.cplusplus.UninitializedObject \ // RUN: -analyzer-config optin.cplusplus.UninitializedObject:Pedantic=true -DPEDANTIC \ // RUN: -analyzer-config optin.cplusplus.UninitializedObject:CheckPointeeInitialization=true \ // RUN: -std=c++11 -verify %s -// RUN: %clang_analyze_cc1 -analyzer-checker=core,optin.cplusplus.UninitializedObject \ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,optin.cplusplus.UninitializedObject \ // RUN: -analyzer-config optin.cplusplus.UninitializedObject:CheckPointeeInitialization=true \ // RUN: -std=c++11 -verify %s @@ -316,7 +316,10 @@ void fCyclicPointerTest2() { // Void pointer tests are mainly no-crash tests. -void *malloc(int size); +typedef __typeof(sizeof(int)) size_t; + +void *calloc(size_t nmemb, size_t size); +void free(void *p); class VoidPointerTest1 { void *vptr; @@ -328,8 +331,9 @@ class VoidPointerTest1 { }; void fVoidPointerTest1() { - void *vptr = malloc(sizeof(int)); + void *vptr = calloc(1, sizeof(int)); VoidPointerTest1(vptr, char()); + free(vptr); } class VoidPointerTest2 { @@ -342,8 +346,9 @@ class VoidPointerTest2 { }; void fVoidPointerTest2() { - void *vptr = malloc(sizeof(int)); + void *vptr = calloc(1, sizeof(int)); VoidPointerTest2(&vptr, char()); + free(vptr); } class VoidPointerRRefTest1 { @@ -359,8 +364,9 @@ upon returning to the caller. This will be a dangling reference}} }; void fVoidPointerRRefTest1() { - void *vptr = malloc(sizeof(int)); + void *vptr = calloc(1, sizeof(int)); VoidPointerRRefTest1(vptr, char()); + free(vptr); } class VoidPointerRRefTest2 { @@ -376,8 +382,9 @@ upon returning to the caller. 
This will be a dangling reference}} }; void fVoidPointerRRefTest2() { - void *vptr = malloc(sizeof(int)); + void *vptr = calloc(1, sizeof(int)); VoidPointerRRefTest2(&vptr, char()); + free(vptr); } class VoidPointerLRefTest { @@ -393,8 +400,9 @@ upon returning to the caller. This will be a dangling reference}} }; void fVoidPointerLRefTest() { - void *vptr = malloc(sizeof(int)); + void *vptr = calloc(1, sizeof(int)); VoidPointerLRefTest(vptr, char()); + free(vptr); } struct CyclicVoidPointerTest { diff --git a/clang/test/Analysis/exercise-ps.c index d214c3959b2078..d1e1771afddb5e 100644 --- a/clang/test/Analysis/exercise-ps.c +++ b/clang/test/Analysis/exercise-ps.c @@ -1,5 +1,5 @@ // RUN: %clang_analyze_cc1 %s -verify -Wno-error=implicit-function-declaration \ -// RUN: -analyzer-checker=core \ +// RUN: -analyzer-checker=core,unix.Malloc \ // RUN: -analyzer-config core.CallAndMessage:ArgPointeeInitializedness=true // // Just exercise the analyzer on code that has at one point caused issues diff --git a/clang/test/Analysis/explain-svals.cpp index 30368b6976cc23..33fce10c4e2b2c 100644 --- a/clang/test/Analysis/explain-svals.cpp +++ b/clang/test/Analysis/explain-svals.cpp @@ -1,7 +1,7 @@ // RUN: %clang_analyze_cc1 -triple i386-apple-darwin10 -verify %s \ -// RUN: -analyzer-checker=core.builtin \ // RUN: -analyzer-checker=debug.ExprInspection \ // RUN: -analyzer-checker=unix.cstring \ +// RUN: -analyzer-checker=unix.Malloc \ // RUN: -analyzer-config display-checker-name=false typedef unsigned long size_t; diff --git a/clang/test/Analysis/malloc-std-namespace.cpp b/clang/test/Analysis/malloc-std-namespace.cpp new file mode 100644 index 00000000000000..d4e397bb812aa9 --- /dev/null +++ b/clang/test/Analysis/malloc-std-namespace.cpp @@ -0,0 +1,24 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -verify -analyzer-output=text %s + +// This file tests that unix.Malloc can handle C++ code where e.g. malloc and +// free are declared within the namespace 'std' by the header <cstdlib>.
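For context on what the new test exercises: a CallDescription registered with CDM::CLibrary (as in the MallocChecker tables above) matches a C library function both at global scope and when it is declared inside namespace 'std', while never matching a C++ member function. A minimal sketch of that matching mode, assuming the usual static analyzer headers (isStdFreeCall is a hypothetical helper written for illustration, not checker code):

  // Sketch: a CDM::CLibrary description matches ::free and std::free,
  // but not a member function such as T::free.
  #include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
  #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"

  using namespace clang::ento;
  using CDM = CallDescription::Mode;

  static bool isStdFreeCall(const CallEvent &Call) {
    static const CallDescription FreeDesc(CDM::CLibrary, {"free"}, 1);
    return FreeDesc.matches(Call);
  }

This is also why the pr81597 regression test further down expects no warning for t.free(s): a member function named free is not the C library deallocator.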
+ +#include "Inputs/system-header-simulator-cxx.h" + +void leak() { + int *p = static_cast(std::malloc(sizeof(int))); // expected-note{{Memory is allocated}} +} // expected-warning{{Potential leak of memory pointed to by 'p'}} + // expected-note@-1{{Potential leak of memory pointed to by 'p'}} + +void no_leak() { + int *p = static_cast(std::malloc(sizeof(int))); + std::free(p); // no-warning +} + +void invalid_free() { + int i; + int *p = &i; + //expected-note@+2{{Argument to free() is the address of the local variable 'i', which is not memory allocated by malloc()}} + //expected-warning@+1{{Argument to free() is the address of the local variable 'i', which is not memory allocated by malloc()}} + std::free(p); +} diff --git a/clang/test/Analysis/malloc.c b/clang/test/Analysis/malloc.c index 09cd4b0bfce638..e5cb45ba733524 100644 --- a/clang/test/Analysis/malloc.c +++ b/clang/test/Analysis/malloc.c @@ -740,6 +740,17 @@ void allocaFree(void) { free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}} } +void allocaFreeBuiltin(void) { + int *p = __builtin_alloca(sizeof(int)); + free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}} +} + +void allocaFreeBuiltinAlign(void) { + int *p = __builtin_alloca_with_align(sizeof(int), 64); + free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}} +} + + int* mallocEscapeRet(void) { int *p = malloc(12); return p; // no warning diff --git a/clang/test/Analysis/malloc.cpp b/clang/test/Analysis/malloc.cpp index 14b4c0576384f2..300b344ab25d69 100644 --- a/clang/test/Analysis/malloc.cpp +++ b/clang/test/Analysis/malloc.cpp @@ -214,3 +214,14 @@ void *realloc(void **ptr, size_t size) { realloc(ptr, size); } // no-crash namespace pr46253_paramty2{ void *realloc(void *ptr, int size) { realloc(ptr, size); } // no-crash } // namespace pr46253_paramty2 + +namespace pr81597 { +struct S {}; +struct T { + void free(const S& s); +}; +void f(T& t) { + S s; + t.free(s); // no-warning: This is not the free you are looking for... 
+} +} // namespace pr81597 diff --git a/clang/test/Analysis/stack-addr-ps.c index e469396e1bb22a..e69ab4189b524f 100644 --- a/clang/test/Analysis/stack-addr-ps.c +++ b/clang/test/Analysis/stack-addr-ps.c @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core -fblocks -verify %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -fblocks -verify %s int* f1(void) { int x = 0; diff --git a/clang/test/Analysis/stackaddrleak.c index 0583bfc18711c5..39c29f2a2635b5 100644 --- a/clang/test/Analysis/stackaddrleak.c +++ b/clang/test/Analysis/stackaddrleak.c @@ -1,5 +1,5 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core -verify -std=c99 -Dbool=_Bool -Wno-bool-conversion %s -// RUN: %clang_analyze_cc1 -analyzer-checker=core -verify -x c++ -Wno-bool-conversion %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -verify -std=c99 -Dbool=_Bool -Wno-bool-conversion %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -verify -x c++ -Wno-bool-conversion %s typedef __INTPTR_TYPE__ intptr_t; char const *p; diff --git a/clang/test/CodeGen/target-data.c index acff367d50eb91..c184f314f68f80 100644 --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -251,11 +251,11 @@ // RUN: %clang_cc1 -triple spir-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=SPIR -// SPIR: target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +// SPIR: target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-G1" // RUN: %clang_cc1 -triple spir64-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=SPIR64 -// SPIR64: target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +// SPIR64: target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-G1" // RUN: %clang_cc1 -triple bpfel -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=BPFEL diff --git a/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp index 95deee8bb1f1f2..0a51b0e4121c33 100644 --- a/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp +++ b/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp @@ -391,3 +391,19 @@ void ArrayInitWithContinue() { })}; } } + +struct [[clang::trivial_abi]] HasTrivialABI { + HasTrivialABI(); + ~HasTrivialABI(); +}; +void AcceptTrivialABI(HasTrivialABI, int); +void TrivialABI() { + // CHECK-LABEL: define dso_local void @_Z10TrivialABIv() + AcceptTrivialABI(HasTrivialABI(), ({ + if (foo()) return; + // CHECK: if.then: + // CHECK-NEXT: call void @_ZN13HasTrivialABID1Ev + // CHECK-NEXT: br label %return + 0; + })); +} diff --git a/clang/test/CodeGenCXX/module-funcs-from-imports.cppm index 33cdf437110a9e..a2a9122fc39130 100644 --- a/clang/test/CodeGenCXX/module-funcs-from-imports.cppm +++ b/clang/test/CodeGenCXX/module-funcs-from-imports.cppm @@ -23,6 +23,21 @@ int func_in_gmf_not_called() { return 44; } +template <class T> +class A { +public: + __attribute__((always_inline)) + inline constexpr int getValue() { + return 43; + } + + inline constexpr int getValue2() { + return 43; + } +}; + +extern template class A<char>; + //--- M.cppm module; #include "foo.h" @@ -47,17 +62,21 @@ int always_inline_func() { return 45; } +export using
::A; + +//--- Use.cpp import M; int use() { - return exported_func() + always_inline_func(); + A<char> a; + return exported_func() + always_inline_func() + + a.getValue() + a.getValue2(); } -// Checks that none of the function (except the always_inline_func) in the importees -// are generated in the importer's code. // CHECK-O0: define{{.*}}_Z3usev( // CHECK-O0: declare{{.*}}_ZW1M13exported_funcv( -// CHECK-O0: define{{.*}}available_externally{{.*}}_ZW1M18always_inline_funcv( +// CHECK-O0: declare{{.*}}_ZW1M18always_inline_funcv( +// CHECK-O0: define{{.*}}@_ZN1AIcE8getValueEv( +// CHECK-O0: declare{{.*}}@_ZN1AIcE9getValue2Ev( // CHECK-O0-NOT: func_in_gmf // CHECK-O0-NOT: func_in_gmf_not_called // CHECK-O0-NOT: non_exported_func @@ -68,7 +87,9 @@ int use() { // O0 to keep consistent ABI. // CHECK-O1: define{{.*}}_Z3usev( // CHECK-O1: declare{{.*}}_ZW1M13exported_funcv( -// CHECK-O1: define{{.*}}available_externally{{.*}}_ZW1M18always_inline_funcv( +// CHECK-O1: declare{{.*}}_ZW1M18always_inline_funcv( +// CHECK-O1: define{{.*}}@_ZN1AIcE8getValueEv( +// CHECK-O1: declare{{.*}}@_ZN1AIcE9getValue2Ev( // CHECK-O1-NOT: func_in_gmf // CHECK-O1-NOT: func_in_gmf_not_called // CHECK-O1-NOT: non_exported_func diff --git a/clang/test/Driver/windows-seh-async-verify.cpp new file mode 100644 index 00000000000000..ace93cf44a31d2 --- /dev/null +++ b/clang/test/Driver/windows-seh-async-verify.cpp @@ -0,0 +1,24 @@ +// RUN: %clang --target=x86_64-pc-windows -fasync-exceptions -fsyntax-only -### %s 2>&1 | FileCheck %s +// RUN: %clang_cl --target=x86_64-pc-windows /EHa -fsyntax-only -### -- %s 2>&1 | FileCheck %s +// RUN: %clang --target=x86_64-pc-windows-gnu -fasync-exceptions -fsyntax-only -### %s 2>&1 | FileCheck %s --check-prefixes=GNU-ALL,GNU +// RUN: %clang_cl --target=x86_64-pc-windows-gnu /EHa -fsyntax-only -### -- %s 2>&1 | FileCheck %s --check-prefixes=GNU-ALL,CL-GNU + +// CHECK-NOT: warning +// GNU: warning: argument unused during compilation: '-fasync-exceptions' [-Wunused-command-line-argument] +// CL-GNU: warning: argument unused during compilation: '/EHa' [-Wunused-command-line-argument] + +// CHECK: -fasync-exceptions +// GNU-ALL-NOT: -fasync-exceptions +struct S { + union _Un { + ~_Un() {} + char _Buf[12]; + }; + _Un _un; +}; + +struct Embed { + S v2; +}; + +void PR62449() { Embed v{}; } diff --git a/clang/test/Modules/hashing-decls-in-exprs-from-gmf-2.cppm new file mode 100644 index 00000000000000..66143102cb9e40 --- /dev/null +++ b/clang/test/Modules/hashing-decls-in-exprs-from-gmf-2.cppm @@ -0,0 +1,44 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/A.cppm -emit-module-interface -o %t/A.pcm +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/test.cpp -fprebuilt-module-path=%t -fsyntax-only -verify + +//--- header.h +#pragma once +template +class Optional {}; + +template +concept C = requires(const _Tp& __t) { + [](const Optional<_Up>&) {}(__t); +}; + +//--- func.h +#include "header.h" +template +void func() {} + +//--- test_func.h +#include "func.h" + +inline void test_func() { + func>(); +} + +//--- A.cppm +module; +#include "header.h" +#include "test_func.h" +export module A; +export using ::test_func; + +//--- test.cpp +// expected-no-diagnostics +import A; +#include "test_func.h" + +void test() { + test_func(); +} diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index 00dafb2988c690..d8bcc3da4b8b1c 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -3098,6 +3098,58 @@ TEST(TransferTest, ResultObjectLocationForCXXOperatorCallExpr) { }); } +// Check that the `std::strong_ordering` object returned by builtin `<=>` has a +// correctly modeled result object location. +TEST(TransferTest, ResultObjectLocationForBuiltinSpaceshipOperator) { + std::string Code = R"( + namespace std { + // This is the minimal definition required to get + // `Sema::CheckComparisonCategoryType()` to accept this fake. + struct strong_ordering { + enum class ordering { less, equal, greater }; + ordering o; + static const strong_ordering less; + static const strong_ordering equivalent; + static const strong_ordering equal; + static const strong_ordering greater; + }; + + inline constexpr strong_ordering strong_ordering::less = + { strong_ordering::ordering::less }; + inline constexpr strong_ordering strong_ordering::equal = + { strong_ordering::ordering::equal }; + inline constexpr strong_ordering strong_ordering::equivalent = + { strong_ordering::ordering::equal }; + inline constexpr strong_ordering strong_ordering::greater = + { strong_ordering::ordering::greater }; + } + void target(int i, int j) { + auto ordering = i <=> j; + // [[p]] + } + )"; + using ast_matchers::binaryOperator; + using ast_matchers::hasOperatorName; + using ast_matchers::match; + using ast_matchers::selectFirst; + using ast_matchers::traverse; + runDataflow( + Code, + [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results, + ASTContext &ASTCtx) { + const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); + + auto *Spaceship = selectFirst<BinaryOperator>( + "op", + match(binaryOperator(hasOperatorName("<=>")).bind("op"), ASTCtx)); + + EXPECT_EQ( + &Env.getResultObjectLocation(*Spaceship), + &getLocForDecl<RecordStorageLocation>(ASTCtx, Env, "ordering")); + }, + LangStandard::lang_cxx20); +} + TEST(TransferTest, ResultObjectLocationForStdInitializerListExpr) { std::string Code = R"( namespace std { diff --git a/flang/docs/Intrinsics.md index ccb93e104dab65..848619cb65d909 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -657,6 +657,14 @@ CALL CO_REDUCE CALL CO_SUM ``` +### Inquiry Functions +ACCESS (GNU extension) is not supported on Windows. Otherwise: +``` +CHARACTER(LEN=*) :: path = 'path/to/file' +IF (ACCESS(path, 'rwx')) & + ... +``` + ## Non-standard intrinsics ### PGI ``` diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td index dff1cdb20cbfef..c181c7ed62dff3 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -3190,4 +3190,36 @@ def fir_CUDADataTransferOp : fir_Op<"cuda_data_transfer", []> { }]; } +def fir_CUDAAllocateOp : fir_Op<"cuda_allocate", [AttrSizedOperandSegments, + MemoryEffects<[MemAlloc]>]> { + let summary = "Perform the device allocation of data of an allocatable"; + + let description = [{ + The fir.cuda_allocate operation performs the device allocation of the + data of an allocatable. The descriptor passed to the operation is + initialized beforehand by the standard flang runtime calls.
+ }]; + + let arguments = (ins Arg:$box, + Arg, "", [MemWrite]>:$errmsg, + Optional:$stream, + Arg, "", [MemWrite]>:$pinned, + Arg, "", [MemRead]>:$source, + fir_CUDADataAttributeAttr:$cuda_attr, + UnitAttr:$hasStat); + + let results = (outs AnyIntegerType:$stat); + + let assemblyFormat = [{ + $box `:` qualified(type($box)) + ( `source` `(` $source^ `:` qualified(type($source) )`)` )? + ( `errmsg` `(` $errmsg^ `:` type($errmsg) `)` )? + ( `stream` `(` $stream^ `:` type($stream) `)` )? + ( `pinned` `(` $pinned^ `:` type($pinned) `)` )? + attr-dict `->` type($stat) + }]; + + let hasVerifier = 1; +} + #endif diff --git a/flang/include/flang/Optimizer/Dialect/FIRTypes.td b/flang/include/flang/Optimizer/Dialect/FIRTypes.td index 4c6a8064991ab0..3b876e4642da9a 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRTypes.td +++ b/flang/include/flang/Optimizer/Dialect/FIRTypes.td @@ -625,6 +625,7 @@ def AnyRefOrBoxLike : TypeConstraint, "any reference or box">; +def AnyRefOrBoxType : Type; def AnyShapeLike : TypeConstraint, "any legal shape type">; diff --git a/flang/include/flang/Runtime/extensions.h b/flang/include/flang/Runtime/extensions.h index 7d0952206fc195..fef651f3b2eedb 100644 --- a/flang/include/flang/Runtime/extensions.h +++ b/flang/include/flang/Runtime/extensions.h @@ -44,5 +44,12 @@ std::int64_t RTNAME(Signal)(std::int64_t number, void (*handler)(int)); // GNU extension subroutine SLEEP(SECONDS) void RTNAME(Sleep)(std::int64_t seconds); +// GNU extension function ACCESS(NAME, MODE) +// TODO: not supported on Windows +#ifndef _WIN32 +std::int64_t FORTRAN_PROCEDURE_NAME(access)(const char *name, + std::int64_t nameLength, const char *mode, std::int64_t modeLength); +#endif + } // extern "C" #endif // FORTRAN_RUNTIME_EXTENSIONS_H_ diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index ae0d8bd37228df..4c51b61f6bf029 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -832,8 +832,8 @@ createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, } bool ClauseProcessor::processMap( - mlir::Location currentLocation, const llvm::omp::Directive &directive, - Fortran::lower::StatementContext &stmtCtx, mlir::omp::MapClauseOps &result, + mlir::Location currentLocation, Fortran::lower::StatementContext &stmtCtx, + mlir::omp::MapClauseOps &result, llvm::SmallVectorImpl *mapSyms, llvm::SmallVectorImpl *mapSymLocs, llvm::SmallVectorImpl *mapSymTypes) const { diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index aa2c14b61e7565..3f9701310ebaeb 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -114,8 +114,7 @@ class ClauseProcessor { // They may be used later on to create the block_arguments for some of the // target directives that require it. 
bool processMap( - mlir::Location currentLocation, const llvm::omp::Directive &directive, - Fortran::lower::StatementContext &stmtCtx, + mlir::Location currentLocation, Fortran::lower::StatementContext &stmtCtx, mlir::omp::MapClauseOps &result, llvm::SmallVectorImpl *mapSyms = nullptr, diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp index 3dcfe0fd775dc5..9b997522366621 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -103,21 +103,6 @@ static fir::GlobalOp globalInitialization( return global; } -static mlir::Operation *getCompareFromReductionOp(mlir::Operation *reductionOp, - mlir::Value loadVal) { - for (mlir::Value reductionOperand : reductionOp->getOperands()) { - if (mlir::Operation *compareOp = reductionOperand.getDefiningOp()) { - if (compareOp->getOperand(0) == loadVal || - compareOp->getOperand(1) == loadVal) - assert((mlir::isa(compareOp) || - mlir::isa(compareOp)) && - "Expected comparison not found in reduction intrinsic"); - return compareOp; - } - } - return nullptr; -} - // Get the extended value for \p val by extracting additional variable // information from \p base. static fir::ExtendedValue getExtendedValue(fir::ExtendedValue base, @@ -237,290 +222,353 @@ createAndSetPrivatizedLoopVar(Fortran::lower::AbstractConverter &converter, return storeOp; } -static mlir::Operation * -findReductionChain(mlir::Value loadVal, mlir::Value *reductionVal = nullptr) { - for (mlir::OpOperand &loadOperand : loadVal.getUses()) { - if (mlir::Operation *reductionOp = loadOperand.getOwner()) { - if (auto convertOp = mlir::dyn_cast(reductionOp)) { - for (mlir::OpOperand &convertOperand : convertOp.getRes().getUses()) { - if (mlir::Operation *reductionOp = convertOperand.getOwner()) - return reductionOp; - } - } - for (mlir::OpOperand &reductionOperand : reductionOp->getUses()) { - if (auto store = - mlir::dyn_cast(reductionOperand.getOwner())) { - if (store.getMemref() == *reductionVal) { - store.erase(); - return reductionOp; - } - } - if (auto assign = - mlir::dyn_cast(reductionOperand.getOwner())) { - if (assign.getLhs() == *reductionVal) { - assign.erase(); - return reductionOp; - } - } - } +// This helper function implements the functionality of "promoting" +// non-CPTR arguments of use_device_ptr to use_device_addr +// arguments (automagic conversion of use_device_ptr -> +// use_device_addr in these cases). The way we do so currently is +// through the shuffling of operands from the devicePtrOperands to +// deviceAddrOperands where necessary and re-organizing the types, +// locations and symbols to maintain the correct ordering of ptr/addr +// input -> BlockArg. +// +// This effectively implements some deprecated OpenMP functionality +// that some legacy applications unfortunately depend on +// (deprecated in specification version 5.2): +// +// "If a list item in a use_device_ptr clause is not of type C_PTR, +// the behavior is as if the list item appeared in a use_device_addr +// clause. Support for such list items in a use_device_ptr clause +// is deprecated."
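Because the promotion below must keep several parallel arrays (types, locations, symbols) index-aligned while entries move, here is a standalone sketch of the move-to-back pattern on plain std::vector (illustrative data only, not flang code). Note the copy taken before erasing: calling push_back first could reallocate and invalidate the iterator.

  #include <cstddef>
  #include <iterator>
  #include <vector>

  // Move the element at 'idx' to the back, preserving the relative order of
  // the remaining elements (the same job as the moveElementToBack lambda in
  // the function that follows).
  template <typename T>
  static void moveElementToBack(std::size_t idx, std::vector<T> &vec) {
    T tmp = *std::next(vec.begin(), idx);
    vec.erase(std::next(vec.begin(), idx));
    vec.push_back(tmp);
  }

  int main() {
    // Two parallel arrays that must stay index-aligned.
    std::vector<int> vals{10, 20, 30};
    std::vector<char> tags{'p', 'q', 'r'};
    moveElementToBack(1, vals); // vals: {10, 30, 20}
    moveElementToBack(1, tags); // tags: {p, r, q}
  }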
+static void promoteNonCPtrUseDevicePtrArgsToUseDeviceAddr( + mlir::omp::UseDeviceClauseOps &clauseOps, + llvm::SmallVectorImpl &useDeviceTypes, + llvm::SmallVectorImpl &useDeviceLocs, + llvm::SmallVectorImpl + &useDeviceSymbols) { + auto moveElementToBack = [](size_t idx, auto &vector) { + auto *iter = std::next(vector.begin(), idx); + vector.push_back(*iter); + vector.erase(iter); + }; + + // Iterate over our use_device_ptr list and shift all non-cptr arguments into + // use_device_addr. + for (auto *it = clauseOps.useDevicePtrVars.begin(); + it != clauseOps.useDevicePtrVars.end();) { + if (!fir::isa_builtin_cptr_type(fir::unwrapRefType(it->getType()))) { + clauseOps.useDeviceAddrVars.push_back(*it); + // We have to shuffle the symbols around as well, to maintain + // the correct Input -> BlockArg for use_device_ptr/use_device_addr. + // NOTE: However, as maps do not seem to be included currently + // this isn't as pertinent, but we must try to maintain it for + // future alterations. I believe the reason they are not currently + // is that the BlockArg assign/lowering needs to be extended + // to a greater set of types. + auto idx = std::distance(clauseOps.useDevicePtrVars.begin(), it); + moveElementToBack(idx, useDeviceTypes); + moveElementToBack(idx, useDeviceLocs); + moveElementToBack(idx, useDeviceSymbols); + it = clauseOps.useDevicePtrVars.erase(it); + continue; } + ++it; } - return nullptr; } -// for a logical operator 'op' reduction X = X op Y -// This function returns the operation responsible for converting Y from -// fir.logical<4> to i1 -static fir::ConvertOp getConvertFromReductionOp(mlir::Operation *reductionOp, - mlir::Value loadVal) { - for (mlir::Value reductionOperand : reductionOp->getOperands()) { - if (auto convertOp = - mlir::dyn_cast(reductionOperand.getDefiningOp())) { - if (convertOp.getOperand() == loadVal) - continue; - return convertOp; +/// Extract the list of function and variable symbols affected by the given +/// 'declare target' directive and return the intended device type for them.
+static void getDeclareTargetInfo( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPDeclareTargetConstruct &declareTargetConstruct, + mlir::omp::DeclareTargetClauseOps &clauseOps, + llvm::SmallVectorImpl &symbolAndClause) { + const auto &spec = std::get( + declareTargetConstruct.t); + if (const auto *objectList{ + Fortran::parser::Unwrap(spec.u)}) { + ObjectList objects{makeObjects(*objectList, semaCtx)}; + // Case: declare target(func, var1, var2) + gatherFuncAndVarSyms(objects, mlir::omp::DeclareTargetCaptureClause::to, + symbolAndClause); + } else if (const auto *clauseList{ + Fortran::parser::Unwrap( + spec.u)}) { + if (clauseList->v.empty()) { + // Case: declare target, implicit capture of function + symbolAndClause.emplace_back( + mlir::omp::DeclareTargetCaptureClause::to, + eval.getOwningProcedure()->getSubprogramSymbol()); } - } - return nullptr; -} -static void updateReduction(mlir::Operation *op, - fir::FirOpBuilder &firOpBuilder, - mlir::Value loadVal, mlir::Value reductionVal, - fir::ConvertOp *convertOp = nullptr) { - mlir::OpBuilder::InsertPoint insertPtDel = firOpBuilder.saveInsertionPoint(); - firOpBuilder.setInsertionPoint(op); + ClauseProcessor cp(converter, semaCtx, *clauseList); + cp.processDeviceType(clauseOps); + cp.processEnter(symbolAndClause); + cp.processLink(symbolAndClause); + cp.processTo(symbolAndClause); - mlir::Value reductionOp; - if (convertOp) - reductionOp = convertOp->getOperand(); - else if (op->getOperand(0) == loadVal) - reductionOp = op->getOperand(1); - else - reductionOp = op->getOperand(0); - - firOpBuilder.create(op->getLoc(), reductionOp, - reductionVal); - firOpBuilder.restoreInsertionPoint(insertPtDel); -} - -static void removeStoreOp(mlir::Operation *reductionOp, mlir::Value symVal) { - for (mlir::Operation *reductionOpUse : reductionOp->getUsers()) { - if (auto convertReduction = - mlir::dyn_cast(reductionOpUse)) { - for (mlir::Operation *convertReductionUse : - convertReduction.getRes().getUsers()) { - if (auto storeOp = mlir::dyn_cast(convertReductionUse)) { - if (storeOp.getMemref() == symVal) - storeOp.erase(); - } - if (auto assignOp = - mlir::dyn_cast(convertReductionUse)) { - if (assignOp.getLhs() == symVal) - assignOp.erase(); - } - } - } + cp.processTODO(converter.getCurrentLocation(), + llvm::omp::Directive::OMPD_declare_target); } } -// Generate an OpenMP reduction operation. -// TODO: Currently assumes it is either an integer addition/multiplication -// reduction, or a logical and reduction. Generalize this for various reduction -// operation types. -// TODO: Generate the reduction operation during lowering instead of creating -// and removing operations since this is not a robust approach. Also, removing -// ops in the builder (instead of a rewriter) is probably not the best approach. 
-static void -genOpenMPReduction(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - const Fortran::parser::OmpClauseList &clauseList) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); +static void collectDeferredDeclareTargets( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPDeclareTargetConstruct &declareTargetConstruct, + llvm::SmallVectorImpl + &deferredDeclareTarget) { + mlir::omp::DeclareTargetClauseOps clauseOps; + llvm::SmallVector symbolAndClause; + getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct, + clauseOps, symbolAndClause); + // Return the device type only if at least one of the targets for the + // directive is a function or subroutine + mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); - List clauses{makeClauses(clauseList, semaCtx)}; - - for (const Clause &clause : clauses) { - if (const auto &reductionClause = - std::get_if(&clause.u)) { - const auto &redOperatorList{ - std::get( - reductionClause->t)}; - assert(redOperatorList.size() == 1 && "Expecting single operator"); - const auto &redOperator = redOperatorList.front(); - const auto &objects{std::get(reductionClause->t)}; - if (const auto *reductionOp = - std::get_if(&redOperator.u)) { - const auto &intrinsicOp{ - std::get( - reductionOp->u)}; - - switch (intrinsicOp) { - case clause::DefinedOperator::IntrinsicOperator::Add: - case clause::DefinedOperator::IntrinsicOperator::Multiply: - case clause::DefinedOperator::IntrinsicOperator::AND: - case clause::DefinedOperator::IntrinsicOperator::EQV: - case clause::DefinedOperator::IntrinsicOperator::OR: - case clause::DefinedOperator::IntrinsicOperator::NEQV: - break; - default: - continue; - } - for (const Object &object : objects) { - if (const Fortran::semantics::Symbol *symbol = object.id()) { - mlir::Value reductionVal = converter.getSymbolAddress(*symbol); - if (auto declOp = reductionVal.getDefiningOp()) - reductionVal = declOp.getBase(); - mlir::Type reductionType = - reductionVal.getType().cast().getEleTy(); - if (!reductionType.isa()) { - if (!reductionType.isIntOrIndexOrFloat()) - continue; - } - for (mlir::OpOperand &reductionValUse : reductionVal.getUses()) { - if (auto loadOp = - mlir::dyn_cast(reductionValUse.getOwner())) { - mlir::Value loadVal = loadOp.getRes(); - if (reductionType.isa()) { - mlir::Operation *reductionOp = findReductionChain(loadVal); - fir::ConvertOp convertOp = - getConvertFromReductionOp(reductionOp, loadVal); - updateReduction(reductionOp, firOpBuilder, loadVal, - reductionVal, &convertOp); - removeStoreOp(reductionOp, reductionVal); - } else if (mlir::Operation *reductionOp = - findReductionChain(loadVal, &reductionVal)) { - updateReduction(reductionOp, firOpBuilder, loadVal, - reductionVal); - } - } - } - } - } - } else if (const auto *reductionIntrinsic = - std::get_if(&redOperator.u)) { - if (!ReductionProcessor::supportedIntrinsicProcReduction( - *reductionIntrinsic)) - continue; - ReductionProcessor::ReductionIdentifier redId = - ReductionProcessor::getReductionType(*reductionIntrinsic); - for (const Object &object : objects) { - if (const Fortran::semantics::Symbol *symbol = object.id()) { - mlir::Value reductionVal = converter.getSymbolAddress(*symbol); - if (auto declOp = reductionVal.getDefiningOp()) - reductionVal = declOp.getBase(); - for (const mlir::OpOperand &reductionValUse : - reductionVal.getUses()) { - if 
(auto loadOp = - mlir::dyn_cast(reductionValUse.getOwner())) { - mlir::Value loadVal = loadOp.getRes(); - // Max is lowered as a compare -> select. - // Match the pattern here. - mlir::Operation *reductionOp = - findReductionChain(loadVal, &reductionVal); - if (reductionOp == nullptr) - continue; - - if (redId == ReductionProcessor::ReductionIdentifier::MAX || - redId == ReductionProcessor::ReductionIdentifier::MIN) { - assert(mlir::isa(reductionOp) && - "Selection Op not found in reduction intrinsic"); - mlir::Operation *compareOp = - getCompareFromReductionOp(reductionOp, loadVal); - updateReduction(compareOp, firOpBuilder, loadVal, - reductionVal); - } - if (redId == ReductionProcessor::ReductionIdentifier::IOR || - redId == ReductionProcessor::ReductionIdentifier::IEOR || - redId == ReductionProcessor::ReductionIdentifier::IAND) { - updateReduction(reductionOp, firOpBuilder, loadVal, - reductionVal); - } - } - } - } - } - } + for (const DeclareTargetCapturePair &symClause : symbolAndClause) { + mlir::Operation *op = mod.lookupSymbol(converter.mangleName( + std::get(symClause))); + + if (!op) { + deferredDeclareTarget.push_back({std::get<0>(symClause), + clauseOps.deviceType, + std::get<1>(symClause)}); } } } -struct OpWithBodyGenInfo { - /// A type for a code-gen callback function. This takes as argument the op for - /// which the code is being generated and returns the arguments of the op's - /// region. - using GenOMPRegionEntryCBFn = - std::function( - mlir::Operation *)>; +static std::optional +getDeclareTargetFunctionDevice( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPDeclareTargetConstruct + &declareTargetConstruct) { + mlir::omp::DeclareTargetClauseOps clauseOps; + llvm::SmallVector symbolAndClause; + getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct, + clauseOps, symbolAndClause); - OpWithBodyGenInfo(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - mlir::Location loc, Fortran::lower::pft::Evaluation &eval) - : converter(converter), semaCtx(semaCtx), loc(loc), eval(eval) {} + // Return the device type only if at least one of the targets for the + // directive is a function or subroutine + mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); + for (const DeclareTargetCapturePair &symClause : symbolAndClause) { + mlir::Operation *op = mod.lookupSymbol(converter.mangleName( + std::get(symClause))); - OpWithBodyGenInfo &setGenNested(bool value) { - genNested = value; - return *this; + if (mlir::isa_and_nonnull(op)) + return clauseOps.deviceType; } - OpWithBodyGenInfo &setOuterCombined(bool value) { - outerCombined = value; - return *this; - } + return std::nullopt; +} - OpWithBodyGenInfo &setClauses(const Fortran::parser::OmpClauseList *value) { - clauses = value; - return *this; - } +static llvm::SmallVector +genLoopVars(mlir::Operation *op, Fortran::lower::AbstractConverter &converter, + mlir::Location &loc, + llvm::ArrayRef args) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + auto ®ion = op->getRegion(0); - OpWithBodyGenInfo &setDataSharingProcessor(DataSharingProcessor *value) { - dsp = value; - return *this; + std::size_t loopVarTypeSize = 0; + for (const Fortran::semantics::Symbol *arg : args) + loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size()); + mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize); + llvm::SmallVector 
tiv(args.size(), loopVarType); + llvm::SmallVector locs(args.size(), loc); + firOpBuilder.createBlock(®ion, {}, tiv, locs); + // The argument is not currently in memory, so make a temporary for the + // argument, and store it there, then bind that location to the argument. + mlir::Operation *storeOp = nullptr; + for (auto [argIndex, argSymbol] : llvm::enumerate(args)) { + mlir::Value indexVal = fir::getBase(region.front().getArgument(argIndex)); + storeOp = + createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol); } + firOpBuilder.setInsertionPointAfter(storeOp); + return llvm::SmallVector(args); +} - OpWithBodyGenInfo &setReductions( - llvm::SmallVectorImpl *value1, - llvm::SmallVectorImpl *value2) { - reductionSymbols = value1; - reductionTypes = value2; - return *this; - } +static void genReductionVars( + mlir::Operation *op, Fortran::lower::AbstractConverter &converter, + mlir::Location &loc, + llvm::ArrayRef reductionArgs, + llvm::ArrayRef reductionTypes) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + llvm::SmallVector blockArgLocs(reductionArgs.size(), loc); - OpWithBodyGenInfo &setGenRegionEntryCb(GenOMPRegionEntryCBFn value) { - genRegionEntryCB = value; - return *this; - } + mlir::Block *entryBlock = firOpBuilder.createBlock( + &op->getRegion(0), {}, reductionTypes, blockArgLocs); - /// [inout] converter to use for the clauses. - Fortran::lower::AbstractConverter &converter; - /// [in] Semantics context - Fortran::semantics::SemanticsContext &semaCtx; - /// [in] location in source code. - mlir::Location loc; - /// [in] current PFT node/evaluation. - Fortran::lower::pft::Evaluation &eval; - /// [in] whether to generate FIR for nested evaluations - bool genNested = true; - /// [in] is this an outer operation - prevents privatization. - bool outerCombined = false; - /// [in] list of clauses to process. - const Fortran::parser::OmpClauseList *clauses = nullptr; - /// [in] if provided, processes the construct's data-sharing attributes. - DataSharingProcessor *dsp = nullptr; - /// [in] if provided, list of reduction symbols - llvm::SmallVectorImpl *reductionSymbols = - nullptr; - /// [in] if provided, list of reduction types - llvm::SmallVectorImpl *reductionTypes = nullptr; - /// [in] if provided, emits the op's region entry. Otherwise, an emtpy block - /// is created in the region. - GenOMPRegionEntryCBFn genRegionEntryCB = nullptr; -}; + // Bind the reduction arguments to their block arguments. + for (auto [arg, prv] : + llvm::zip_equal(reductionArgs, entryBlock->getArguments())) { + converter.bindSymbol(*arg, prv); + } +} -/// Create the body (block) for an OpenMP Operation. -/// -/// \param [in] op - the operation the body belongs to. -/// \param [in] info - options controlling code-gen for the construction. 
-template +static llvm::SmallVector +genLoopAndReductionVars( + mlir::Operation *op, Fortran::lower::AbstractConverter &converter, + mlir::Location &loc, + llvm::ArrayRef loopArgs, + llvm::ArrayRef reductionArgs, + llvm::ArrayRef reductionTypes) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + + llvm::SmallVector blockArgTypes; + llvm::SmallVector blockArgLocs; + blockArgTypes.reserve(loopArgs.size() + reductionArgs.size()); + blockArgLocs.reserve(blockArgTypes.size()); + mlir::Block *entryBlock; + + if (loopArgs.size()) { + std::size_t loopVarTypeSize = 0; + for (const Fortran::semantics::Symbol *arg : loopArgs) + loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size()); + mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize); + std::fill_n(std::back_inserter(blockArgTypes), loopArgs.size(), + loopVarType); + std::fill_n(std::back_inserter(blockArgLocs), loopArgs.size(), loc); + } + if (reductionArgs.size()) { + llvm::copy(reductionTypes, std::back_inserter(blockArgTypes)); + std::fill_n(std::back_inserter(blockArgLocs), reductionArgs.size(), loc); + } + entryBlock = firOpBuilder.createBlock(&op->getRegion(0), {}, blockArgTypes, + blockArgLocs); + // The argument is not currently in memory, so make a temporary for the + // argument, and store it there, then bind that location to the argument. + if (loopArgs.size()) { + mlir::Operation *storeOp = nullptr; + for (auto [argIndex, argSymbol] : llvm::enumerate(loopArgs)) { + mlir::Value indexVal = + fir::getBase(op->getRegion(0).front().getArgument(argIndex)); + storeOp = + createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol); + } + firOpBuilder.setInsertionPointAfter(storeOp); + } + // Bind the reduction arguments to their block arguments + for (auto [arg, prv] : llvm::zip_equal( + reductionArgs, + llvm::drop_begin(entryBlock->getArguments(), loopArgs.size()))) { + converter.bindSymbol(*arg, prv); + } + + return llvm::SmallVector(loopArgs); +} + +static void +markDeclareTarget(mlir::Operation *op, + Fortran::lower::AbstractConverter &converter, + mlir::omp::DeclareTargetCaptureClause captureClause, + mlir::omp::DeclareTargetDeviceType deviceType) { + // TODO: Add support for program local variables with declare target applied + auto declareTargetOp = llvm::dyn_cast(op); + if (!declareTargetOp) + fir::emitFatalError( + converter.getCurrentLocation(), + "Attempt to apply declare target on unsupported operation"); + + // The function or global already has a declare target applied to it, very + // likely through implicit capture (usage in another declare target + // function/subroutine). It should be marked as any if it has been assigned + // both host and nohost, else we skip, as there is no change + if (declareTargetOp.isDeclareTarget()) { + if (declareTargetOp.getDeclareTargetDeviceType() != deviceType) + declareTargetOp.setDeclareTarget(mlir::omp::DeclareTargetDeviceType::any, + captureClause); + return; + } + + declareTargetOp.setDeclareTarget(deviceType, captureClause); +} + +//===----------------------------------------------------------------------===// +// Op body generation helper structures and functions +//===----------------------------------------------------------------------===// + +struct OpWithBodyGenInfo { + /// A type for a code-gen callback function. This takes as argument the op for + /// which the code is being generated and returns the arguments of the op's + /// region. 
+ using GenOMPRegionEntryCBFn = + std::function( + mlir::Operation *)>; + + OpWithBodyGenInfo(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + mlir::Location loc, Fortran::lower::pft::Evaluation &eval) + : converter(converter), semaCtx(semaCtx), loc(loc), eval(eval) {} + + OpWithBodyGenInfo &setGenNested(bool value) { + genNested = value; + return *this; + } + + OpWithBodyGenInfo &setOuterCombined(bool value) { + outerCombined = value; + return *this; + } + + OpWithBodyGenInfo &setClauses(const Fortran::parser::OmpClauseList *value) { + clauses = value; + return *this; + } + + OpWithBodyGenInfo &setDataSharingProcessor(DataSharingProcessor *value) { + dsp = value; + return *this; + } + + OpWithBodyGenInfo &setReductions( + llvm::SmallVectorImpl *value1, + llvm::SmallVectorImpl *value2) { + reductionSymbols = value1; + reductionTypes = value2; + return *this; + } + + OpWithBodyGenInfo &setGenRegionEntryCb(GenOMPRegionEntryCBFn value) { + genRegionEntryCB = value; + return *this; + } + + /// [inout] converter to use for the clauses. + Fortran::lower::AbstractConverter &converter; + /// [in] Semantics context + Fortran::semantics::SemanticsContext &semaCtx; + /// [in] location in source code. + mlir::Location loc; + /// [in] current PFT node/evaluation. + Fortran::lower::pft::Evaluation &eval; + /// [in] whether to generate FIR for nested evaluations + bool genNested = true; + /// [in] is this an outer operation - prevents privatization. + bool outerCombined = false; + /// [in] list of clauses to process. + const Fortran::parser::OmpClauseList *clauses = nullptr; + /// [in] if provided, processes the construct's data-sharing attributes. + DataSharingProcessor *dsp = nullptr; + /// [in] if provided, list of reduction symbols + llvm::SmallVectorImpl *reductionSymbols = + nullptr; + /// [in] if provided, list of reduction types + llvm::SmallVectorImpl *reductionTypes = nullptr; + /// [in] if provided, emits the op's region entry. Otherwise, an empty block + /// is created in the region. + GenOMPRegionEntryCBFn genRegionEntryCB = nullptr; +}; + +/// Create the body (block) for an OpenMP Operation. +/// +/// \param [in] op - the operation the body belongs to. +/// \param [in] info - options controlling code-gen for the construction.
+template static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) { fir::FirOpBuilder &firOpBuilder = info.converter.getFirOpBuilder(); @@ -715,298 +763,336 @@ static void genBodyOfTargetDataOp( genNestedEvaluations(converter, eval); } -template -static OpTy genOpWithBody(OpWithBodyGenInfo &info, Args &&...args) { - auto op = info.converter.getFirOpBuilder().create( - info.loc, std::forward(args)...); - createBodyOfOp(op, info); - return op; -} - -static mlir::omp::MasterOp -genMasterOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation) { - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested)); -} - -static mlir::omp::OrderedRegionOp -genOrderedRegionOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList) { - mlir::omp::OrderedRegionClauseOps clauseOps; - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processTODO(currentLocation, - llvm::omp::Directive::OMPD_ordered); - - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested), - clauseOps); -} +// This function creates a block for the body of the targetOp's region. It adds +// all the symbols present in mapSymbols as block arguments to this block. +static void +genBodyOfTargetOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::omp::TargetOp &targetOp, + llvm::ArrayRef mapSyms, + llvm::ArrayRef mapSymLocs, + llvm::ArrayRef mapSymTypes, + const mlir::Location &currentLocation) { + assert(mapSymTypes.size() == mapSymLocs.size()); -static mlir::omp::ParallelOp -genParallelOp(Fortran::lower::AbstractConverter &converter, - Fortran::lower::SymMap &symTable, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList, - bool outerCombined = false) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - Fortran::lower::StatementContext stmtCtx; - mlir::omp::ParallelClauseOps clauseOps; - llvm::SmallVector privateSyms; - llvm::SmallVector reductionTypes; - llvm::SmallVector reductionSyms; - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(llvm::omp::Directive::OMPD_parallel, clauseOps); - cp.processNumThreads(stmtCtx, clauseOps); - cp.processProcBind(clauseOps); - cp.processDefault(); - cp.processAllocate(clauseOps); - - if (!outerCombined) - cp.processReduction(currentLocation, clauseOps, &reductionTypes, - &reductionSyms); + mlir::Region &region = targetOp.getRegion(); - if (ReductionProcessor::doReductionByRef(clauseOps.reductionVars)) - clauseOps.reductionByRefAttr = firOpBuilder.getUnitAttr(); + auto *regionBlock = + firOpBuilder.createBlock(&region, {}, mapSymTypes, mapSymLocs); - auto reductionCallback = [&](mlir::Operation *op) { - llvm::SmallVector locs(clauseOps.reductionVars.size(), - currentLocation); - auto *block = - firOpBuilder.createBlock(&op->getRegion(0), {}, reductionTypes, locs); - for (auto [arg, prv] : - llvm::zip_equal(reductionSyms, block->getArguments())) { - converter.bindSymbol(*arg, prv); + // Clones the `bounds`
placing them inside the target region and returns them. + auto cloneBound = [&](mlir::Value bound) { + if (mlir::isMemoryEffectFree(bound.getDefiningOp())) { + mlir::Operation *clonedOp = bound.getDefiningOp()->clone(); + regionBlock->push_back(clonedOp); + return clonedOp->getResult(0); } - return reductionSyms; + TODO(converter.getCurrentLocation(), + "target map clause operand unsupported bound type"); }; - OpWithBodyGenInfo genInfo = - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setOuterCombined(outerCombined) - .setClauses(&clauseList) - .setReductions(&reductionSyms, &reductionTypes) - .setGenRegionEntryCb(reductionCallback); - - if (!enableDelayedPrivatization) - return genOpWithBody(genInfo, clauseOps); - - bool privatize = !outerCombined; - DataSharingProcessor dsp(converter, semaCtx, clauseList, eval, - /*useDelayedPrivatization=*/true, &symTable); - - if (privatize) - dsp.processStep1(&clauseOps, &privateSyms); - - auto genRegionEntryCB = [&](mlir::Operation *op) { - auto parallelOp = llvm::cast(op); + auto cloneBounds = [cloneBound](llvm::ArrayRef bounds) { + llvm::SmallVector clonedBounds; + for (mlir::Value bound : bounds) + clonedBounds.emplace_back(cloneBound(bound)); + return clonedBounds; + }; - llvm::SmallVector reductionLocs( - clauseOps.reductionVars.size(), currentLocation); - - mlir::OperandRange privateVars = parallelOp.getPrivateVars(); - mlir::Region ®ion = parallelOp.getRegion(); + // Bind the symbols to their corresponding block arguments. + for (auto [argIndex, argSymbol] : llvm::enumerate(mapSyms)) { + const mlir::BlockArgument &arg = region.getArgument(argIndex); + // Avoid capture of a reference to a structured binding. + const Fortran::semantics::Symbol *sym = argSymbol; + // Structure component symbols don't have bindings. + if (sym->owner().IsDerivedType()) + continue; + fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*sym); + extVal.match( + [&](const fir::BoxValue &v) { + converter.bindSymbol(*sym, + fir::BoxValue(arg, cloneBounds(v.getLBounds()), + v.getExplicitParameters(), + v.getExplicitExtents())); + }, + [&](const fir::MutableBoxValue &v) { + converter.bindSymbol( + *sym, fir::MutableBoxValue(arg, cloneBounds(v.getLBounds()), + v.getMutableProperties())); + }, + [&](const fir::ArrayBoxValue &v) { + converter.bindSymbol( + *sym, fir::ArrayBoxValue(arg, cloneBounds(v.getExtents()), + cloneBounds(v.getLBounds()), + v.getSourceBox())); + }, + [&](const fir::CharArrayBoxValue &v) { + converter.bindSymbol( + *sym, fir::CharArrayBoxValue(arg, cloneBound(v.getLen()), + cloneBounds(v.getExtents()), + cloneBounds(v.getLBounds()))); + }, + [&](const fir::CharBoxValue &v) { + converter.bindSymbol(*sym, + fir::CharBoxValue(arg, cloneBound(v.getLen()))); + }, + [&](const fir::UnboxedValue &v) { converter.bindSymbol(*sym, arg); }, + [&](const auto &) { + TODO(converter.getCurrentLocation(), + "target map clause operand unsupported type"); + }); + } - llvm::SmallVector privateVarTypes = reductionTypes; - privateVarTypes.reserve(privateVarTypes.size() + privateVars.size()); - llvm::transform(privateVars, std::back_inserter(privateVarTypes), - [](mlir::Value v) { return v.getType(); }); + // Check if cloning the bounds introduced any dependency on the outer region. + // If so, then either clone them as well if they are MemoryEffectFree, or else + // copy them to a new temporary and add them to the map and block_argument + // lists and replace their uses with the new temporary. 
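Because the implementation below is interleaved with removed code, here is a condensed sketch of just the two per-value strategies described above (simplified and with hypothetical names; the real code additionally records the map-info bookkeeping shown in full below):

  #include "llvm/ADT/SetVector.h"
  #include "mlir/Interfaces/SideEffectInterfaces.h"
  #include "mlir/Transforms/RegionUtils.h"

  // Condensed outline only; captureHostValues is an illustrative name.
  static void captureHostValues(mlir::Region &region, mlir::Block *regionBlock) {
    llvm::SetVector<mlir::Value> usedAbove;
    mlir::getUsedValuesDefinedAbove(region, usedAbove);
    for (mlir::Value val : usedAbove) {
      mlir::Operation *def = val.getDefiningOp();
      if (def && mlir::isMemoryEffectFree(def)) {
        // Strategy 1: the value is side-effect free, so recompute it inside
        // the target region and redirect in-region uses to the clone.
        mlir::Operation *clone = def->clone();
        regionBlock->push_front(clone);
        val.replaceUsesWithIf(clone->getResult(0), [&](mlir::OpOperand &use) {
          return use.getOwner()->getBlock() == regionBlock;
        });
      } else {
        // Strategy 2: store the value into a host temporary, append the
        // temporary to the op's map operands, and reload it inside the
        // region from a freshly added block argument.
      }
    }
  }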
+ llvm::SetVector valuesDefinedAbove; + mlir::getUsedValuesDefinedAbove(region, valuesDefinedAbove); + while (!valuesDefinedAbove.empty()) { + for (mlir::Value val : valuesDefinedAbove) { + mlir::Operation *valOp = val.getDefiningOp(); + if (mlir::isMemoryEffectFree(valOp)) { + mlir::Operation *clonedOp = valOp->clone(); + regionBlock->push_front(clonedOp); + val.replaceUsesWithIf( + clonedOp->getResult(0), [regionBlock](mlir::OpOperand &use) { + return use.getOwner()->getBlock() == regionBlock; + }); + } else { + auto savedIP = firOpBuilder.getInsertionPoint(); + firOpBuilder.setInsertionPointAfter(valOp); + auto copyVal = + firOpBuilder.createTemporary(val.getLoc(), val.getType()); + firOpBuilder.createStoreWithConvert(copyVal.getLoc(), val, copyVal); - llvm::SmallVector privateVarLocs = reductionLocs; - privateVarLocs.reserve(privateVarLocs.size() + privateVars.size()); - llvm::transform(privateVars, std::back_inserter(privateVarLocs), - [](mlir::Value v) { return v.getLoc(); }); + llvm::SmallVector bounds; + std::stringstream name; + firOpBuilder.setInsertionPoint(targetOp); + mlir::Value mapOp = createMapInfoOp( + firOpBuilder, copyVal.getLoc(), copyVal, mlir::Value{}, name.str(), + bounds, llvm::SmallVector{}, + static_cast< + std::underlying_type_t>( + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT), + mlir::omp::VariableCaptureKind::ByCopy, copyVal.getType()); + targetOp.getMapOperandsMutable().append(mapOp); + mlir::Value clonedValArg = + region.addArgument(copyVal.getType(), copyVal.getLoc()); + firOpBuilder.setInsertionPointToStart(regionBlock); + auto loadOp = firOpBuilder.create(clonedValArg.getLoc(), + clonedValArg); + val.replaceUsesWithIf( + loadOp->getResult(0), [regionBlock](mlir::OpOperand &use) { + return use.getOwner()->getBlock() == regionBlock; + }); + firOpBuilder.setInsertionPoint(regionBlock, savedIP); + } + } + valuesDefinedAbove.clear(); + mlir::getUsedValuesDefinedAbove(region, valuesDefinedAbove); + } - firOpBuilder.createBlock(&region, /*insertPt=*/{}, privateVarTypes, - privateVarLocs); + // Insert dummy instruction to remember the insertion position. The + // marker will be deleted since there are no uses. + // In the HLFIR flow there are hlfir.declares inserted above while + // setting block arguments. + mlir::Value undefMarker = firOpBuilder.create( + targetOp.getOperation()->getLoc(), firOpBuilder.getIndexType()); - llvm::SmallVector allSymbols = - reductionSyms; - allSymbols.append(privateSyms); - for (auto [arg, prv] : llvm::zip_equal(allSymbols, region.getArguments())) { - converter.bindSymbol(*arg, prv); - } + // Create blocks for unstructured regions. This has to be done since + // blocks are initially allocated with the function as the parent region. + if (eval.lowerAsUnstructured()) { + Fortran::lower::createEmptyRegionBlocks( + firOpBuilder, eval.getNestedEvaluations()); + } - return allSymbols; - }; + firOpBuilder.create(currentLocation); - // TODO Merge with the reduction CB. - genInfo.setGenRegionEntryCb(genRegionEntryCB).setDataSharingProcessor(&dsp); - return genOpWithBody(genInfo, clauseOps); + // Create the insertion point after the marker.
+ firOpBuilder.setInsertionPointAfter(undefMarker.getDefiningOp()); + if (genNested) + genNestedEvaluations(converter, eval); } -static mlir::omp::SectionOp -genSectionOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList §ionsClauseList) { - // Currently only private/firstprivate clause is handled, and - // all privatization is done within `omp.section` operations. - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setClauses(§ionsClauseList)); +template +static OpTy genOpWithBody(OpWithBodyGenInfo &info, Args &&...args) { + auto op = info.converter.getFirOpBuilder().create( + info.loc, std::forward(args)...); + createBodyOfOp(op, info); + return op; } -static mlir::omp::SingleOp -genSingleOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &beginClauseList, - const Fortran::parser::OmpClauseList &endClauseList) { - mlir::omp::SingleClauseOps clauseOps; - - ClauseProcessor cp(converter, semaCtx, beginClauseList); - cp.processAllocate(clauseOps); - // TODO Support delayed privatization. +//===----------------------------------------------------------------------===// +// Code generation functions for clauses +//===----------------------------------------------------------------------===// - ClauseProcessor ecp(converter, semaCtx, endClauseList); - ecp.processNowait(clauseOps); - ecp.processCopyprivate(currentLocation, clauseOps); +static void genCriticalDeclareClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, + mlir::omp::CriticalClauseOps &clauseOps, llvm::StringRef name) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processHint(clauseOps); + clauseOps.nameAttr = + mlir::StringAttr::get(converter.getFirOpBuilder().getContext(), name); +} - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setClauses(&beginClauseList), - clauseOps); +static void genFlushClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const std::optional &objects, + const std::optional> + &clauses, + mlir::Location loc, llvm::SmallVectorImpl &operandRange) { + if (objects) + genObjectList2(*objects, converter, operandRange); + + if (clauses && clauses->size() > 0) + TODO(converter.getCurrentLocation(), "Handle OmpMemoryOrderClause"); } -static mlir::omp::TaskOp -genTaskOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList) { - Fortran::lower::StatementContext stmtCtx; - mlir::omp::TaskClauseOps clauseOps; +static void +genOrderedRegionClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::OrderedRegionClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processTODO(loc, llvm::omp::Directive::OMPD_ordered); +} - 
-  ClauseProcessor cp(converter, semaCtx, clauseList);
-  cp.processIf(llvm::omp::Directive::OMPD_task, clauseOps);
+static void genParallelClauses(
+    Fortran::lower::AbstractConverter &converter,
+    Fortran::semantics::SemanticsContext &semaCtx,
+    Fortran::lower::StatementContext &stmtCtx,
+    const Fortran::parser::OmpClauseList &clauses, mlir::Location loc,
+    bool processReduction, mlir::omp::ParallelClauseOps &clauseOps,
+    llvm::SmallVectorImpl<mlir::Type> &reductionTypes,
+    llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> &reductionSyms) {
+  ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processAllocate(clauseOps);
   cp.processDefault();
-  cp.processFinal(stmtCtx, clauseOps);
-  cp.processUntied(clauseOps);
-  cp.processMergeable(clauseOps);
-  cp.processPriority(stmtCtx, clauseOps);
-  cp.processDepend(clauseOps);
+  cp.processIf(llvm::omp::Directive::OMPD_parallel, clauseOps);
+  cp.processNumThreads(stmtCtx, clauseOps);
+  cp.processProcBind(clauseOps);
+
+  if (processReduction) {
+    cp.processReduction(loc, clauseOps, &reductionTypes, &reductionSyms);
+    if (ReductionProcessor::doReductionByRef(clauseOps.reductionVars))
+      clauseOps.reductionByRefAttr = converter.getFirOpBuilder().getUnitAttr();
+  }
+}
+
+static void genSectionsClauses(Fortran::lower::AbstractConverter &converter,
+                               Fortran::semantics::SemanticsContext &semaCtx,
+                               const Fortran::parser::OmpClauseList &clauses,
+                               mlir::Location loc,
+                               bool clausesFromBeginSections,
+                               mlir::omp::SectionsClauseOps &clauseOps) {
+  ClauseProcessor cp(converter, semaCtx, clauses);
+  if (clausesFromBeginSections) {
+    cp.processAllocate(clauseOps);
+    cp.processSectionsReduction(loc, clauseOps);
+    // TODO Support delayed privatization.
+  } else {
+    cp.processNowait(clauseOps);
+  }
+}
+
+static void genSimdLoopClauses(
+    Fortran::lower::AbstractConverter &converter,
+    Fortran::semantics::SemanticsContext &semaCtx,
+    Fortran::lower::StatementContext &stmtCtx,
+    Fortran::lower::pft::Evaluation &eval,
+    const Fortran::parser::OmpClauseList &clauses, mlir::Location loc,
+    mlir::omp::SimdLoopClauseOps &clauseOps,
+    llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> &iv) {
+  ClauseProcessor cp(converter, semaCtx, clauses);
+  cp.processCollapse(loc, eval, clauseOps, iv);
+  cp.processIf(llvm::omp::Directive::OMPD_simd, clauseOps);
+  cp.processReduction(loc, clauseOps);
+  cp.processSafelen(clauseOps);
+  cp.processSimdlen(clauseOps);
+  clauseOps.loopInclusiveAttr = converter.getFirOpBuilder().getUnitAttr();
   // TODO Support delayed privatization.
-  cp.processTODO(
-      currentLocation, llvm::omp::Directive::OMPD_task);
+  cp.processTODO(
+      loc, llvm::omp::Directive::OMPD_simd);
+}
+
+static void genSingleClauses(Fortran::lower::AbstractConverter &converter,
+                             Fortran::semantics::SemanticsContext &semaCtx,
+                             const Fortran::parser::OmpClauseList &beginClauses,
+                             const Fortran::parser::OmpClauseList &endClauses,
+                             mlir::Location loc,
+                             mlir::omp::SingleClauseOps &clauseOps) {
+  ClauseProcessor bcp(converter, semaCtx, beginClauses);
+  bcp.processAllocate(clauseOps);
+  // TODO Support delayed privatization.
- return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setClauses(&clauseList), - clauseOps); + ClauseProcessor ecp(converter, semaCtx, endClauses); + ecp.processCopyprivate(loc, clauseOps); + ecp.processNowait(clauseOps); } -static mlir::omp::TaskgroupOp -genTaskgroupOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList) { - mlir::omp::TaskgroupClauseOps clauseOps; +static void genTargetClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, + bool processHostOnlyClauses, bool processReduction, + mlir::omp::TargetClauseOps &clauseOps, + llvm::SmallVectorImpl &mapSyms, + llvm::SmallVectorImpl &mapLocs, + llvm::SmallVectorImpl &mapTypes, + llvm::SmallVectorImpl &deviceAddrSyms, + llvm::SmallVectorImpl &deviceAddrLocs, + llvm::SmallVectorImpl &deviceAddrTypes, + llvm::SmallVectorImpl &devicePtrSyms, + llvm::SmallVectorImpl &devicePtrLocs, + llvm::SmallVectorImpl &devicePtrTypes) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processDepend(clauseOps); + cp.processDevice(stmtCtx, clauseOps); + cp.processHasDeviceAddr(clauseOps, deviceAddrTypes, deviceAddrLocs, + deviceAddrSyms); + cp.processIf(llvm::omp::Directive::OMPD_target, clauseOps); + cp.processIsDevicePtr(clauseOps, devicePtrTypes, devicePtrLocs, + devicePtrSyms); + cp.processMap(loc, stmtCtx, clauseOps, &mapSyms, &mapLocs, &mapTypes); + cp.processThreadLimit(stmtCtx, clauseOps); + // TODO Support delayed privatization. - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processAllocate(clauseOps); - cp.processTODO(currentLocation, - llvm::omp::Directive::OMPD_taskgroup); + if (processHostOnlyClauses) + cp.processNowait(clauseOps); - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setClauses(&clauseList), - clauseOps); + cp.processTODO(loc, + llvm::omp::Directive::OMPD_target); } -// This helper function implements the functionality of "promoting" -// non-CPTR arguments of use_device_ptr to use_device_addr -// arguments (automagic conversion of use_device_ptr -> -// use_device_addr in these cases). The way we do so currently is -// through the shuffling of operands from the devicePtrOperands to -// deviceAddrOperands where neccesary and re-organizing the types, -// locations and symbols to maintain the correct ordering of ptr/addr -// input -> BlockArg. -// -// This effectively implements some deprecated OpenMP functionality -// that some legacy applications unfortunately depend on -// (deprecated in specification version 5.2): -// -// "If a list item in a use_device_ptr clause is not of type C_PTR, -// the behavior is as if the list item appeared in a use_device_addr -// clause. Support for such list items in a use_device_ptr clause -// is deprecated." 
-static void promoteNonCPtrUseDevicePtrArgsToUseDeviceAddr( - mlir::omp::UseDeviceClauseOps &clauseOps, +static void genTargetDataClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, + mlir::omp::TargetDataClauseOps &clauseOps, llvm::SmallVectorImpl &useDeviceTypes, llvm::SmallVectorImpl &useDeviceLocs, - llvm::SmallVectorImpl - &useDeviceSymbols) { - auto moveElementToBack = [](size_t idx, auto &vector) { - auto *iter = std::next(vector.begin(), idx); - vector.push_back(*iter); - vector.erase(iter); - }; - - // Iterate over our use_device_ptr list and shift all non-cptr arguments into - // use_device_addr. - for (auto *it = clauseOps.useDevicePtrVars.begin(); - it != clauseOps.useDevicePtrVars.end();) { - if (!fir::isa_builtin_cptr_type(fir::unwrapRefType(it->getType()))) { - clauseOps.useDeviceAddrVars.push_back(*it); - // We have to shuffle the symbols around as well, to maintain - // the correct Input -> BlockArg for use_device_ptr/use_device_addr. - // NOTE: However, as map's do not seem to be included currently - // this isn't as pertinent, but we must try to maintain for - // future alterations. I believe the reason they are not currently - // is that the BlockArg assign/lowering needs to be extended - // to a greater set of types. - auto idx = std::distance(clauseOps.useDevicePtrVars.begin(), it); - moveElementToBack(idx, useDeviceTypes); - moveElementToBack(idx, useDeviceLocs); - moveElementToBack(idx, useDeviceSymbols); - it = clauseOps.useDevicePtrVars.erase(it); - continue; - } - ++it; - } -} - -static mlir::omp::TargetDataOp -genTargetDataOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList) { - Fortran::lower::StatementContext stmtCtx; - mlir::omp::TargetDataClauseOps clauseOps; - llvm::SmallVector useDeviceTypes; - llvm::SmallVector useDeviceLocs; - llvm::SmallVector useDeviceSyms; - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(llvm::omp::Directive::OMPD_target_data, clauseOps); + llvm::SmallVectorImpl &useDeviceSyms) { + ClauseProcessor cp(converter, semaCtx, clauses); cp.processDevice(stmtCtx, clauseOps); - cp.processUseDevicePtr(clauseOps, useDeviceTypes, useDeviceLocs, - useDeviceSyms); + cp.processIf(llvm::omp::Directive::OMPD_target_data, clauseOps); + cp.processMap(loc, stmtCtx, clauseOps); cp.processUseDeviceAddr(clauseOps, useDeviceTypes, useDeviceLocs, useDeviceSyms); + cp.processUseDevicePtr(clauseOps, useDeviceTypes, useDeviceLocs, + useDeviceSyms); // This function implements the deprecated functionality of use_device_ptr // that allows users to provide non-CPTR arguments to it with the caveat @@ -1014,249 +1100,400 @@ genTargetDataOp(Fortran::lower::AbstractConverter &converter, // code may still depend on this functionality, so we should support it // in some manner. We do so currently by simply shifting non-cptr operands // from the use_device_ptr list into the front of the use_device_addr list - // whilst maintaining the ordering of useDeviceLocs, useDeviceSymbols and + // whilst maintaining the ordering of useDeviceLocs, useDeviceSyms and // useDeviceTypes to use_device_ptr/use_device_addr input for BlockArg // ordering. 
// TODO: Perhaps create a user provideable compiler option that will // re-introduce a hard-error rather than a warning in these cases. promoteNonCPtrUseDevicePtrArgsToUseDeviceAddr(clauseOps, useDeviceTypes, useDeviceLocs, useDeviceSyms); - cp.processMap(currentLocation, llvm::omp::Directive::OMPD_target_data, - stmtCtx, clauseOps); - - auto dataOp = converter.getFirOpBuilder().create( - currentLocation, clauseOps); - - genBodyOfTargetDataOp(converter, semaCtx, eval, genNested, dataOp, - useDeviceTypes, useDeviceLocs, useDeviceSyms, - currentLocation); - return dataOp; } -template -static OpTy genTargetEnterExitDataUpdateOp( +static void genTargetEnterExitUpdateDataClauses( Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semaCtx, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - Fortran::lower::StatementContext stmtCtx; - mlir::omp::TargetEnterExitUpdateDataClauseOps clauseOps; - - // GCC 9.3.0 emits a (probably) bogus warning about an unused variable. - [[maybe_unused]] llvm::omp::Directive directive; - if constexpr (std::is_same_v) { - directive = llvm::omp::Directive::OMPD_target_enter_data; - } else if constexpr (std::is_same_v) { - directive = llvm::omp::Directive::OMPD_target_exit_data; - } else if constexpr (std::is_same_v) { - directive = llvm::omp::Directive::OMPD_target_update; - } else { - return nullptr; - } - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(directive, clauseOps); - cp.processDevice(stmtCtx, clauseOps); + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, + llvm::omp::Directive directive, + mlir::omp::TargetEnterExitUpdateDataClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); cp.processDepend(clauseOps); + cp.processDevice(stmtCtx, clauseOps); + cp.processIf(directive, clauseOps); cp.processNowait(clauseOps); - if constexpr (std::is_same_v) { + if (directive == llvm::omp::Directive::OMPD_target_update) { cp.processMotionClauses(stmtCtx, clauseOps); cp.processMotionClauses(stmtCtx, clauseOps); } else { - cp.processMap(currentLocation, directive, stmtCtx, clauseOps); + cp.processMap(loc, stmtCtx, clauseOps); } +} - return firOpBuilder.create(currentLocation, clauseOps); +static void genTaskClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::TaskClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processAllocate(clauseOps); + cp.processDefault(); + cp.processDepend(clauseOps); + cp.processFinal(stmtCtx, clauseOps); + cp.processIf(llvm::omp::Directive::OMPD_task, clauseOps); + cp.processMergeable(clauseOps); + cp.processPriority(stmtCtx, clauseOps); + cp.processUntied(clauseOps); + // TODO Support delayed privatization. + + cp.processTODO( + loc, llvm::omp::Directive::OMPD_task); } -// This functions creates a block for the body of the targetOp's region. It adds -// all the symbols present in mapSymbols as block arguments to this block. 
-static void -genBodyOfTargetOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::omp::TargetOp &targetOp, - llvm::ArrayRef mapSyms, - llvm::ArrayRef mapSymLocs, - llvm::ArrayRef mapSymTypes, - const mlir::Location ¤tLocation) { - assert(mapSymTypes.size() == mapSymLocs.size()); +static void genTaskgroupClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::TaskgroupClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processAllocate(clauseOps); + + cp.processTODO(loc, + llvm::omp::Directive::OMPD_taskgroup); +} +static void genTaskwaitClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::TaskwaitClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processTODO( + loc, llvm::omp::Directive::OMPD_taskwait); +} + +static void genTeamsClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::TeamsClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processAllocate(clauseOps); + cp.processDefault(); + cp.processIf(llvm::omp::Directive::OMPD_teams, clauseOps); + cp.processNumTeams(stmtCtx, clauseOps); + cp.processThreadLimit(stmtCtx, clauseOps); + // TODO Support delayed privatization. + + cp.processTODO(loc, llvm::omp::Directive::OMPD_teams); +} + +static void genWsloopClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OmpClauseList &beginClauses, + const Fortran::parser::OmpClauseList *endClauses, mlir::Location loc, + mlir::omp::WsloopClauseOps &clauseOps, + llvm::SmallVectorImpl &iv, + llvm::SmallVectorImpl &reductionTypes, + llvm::SmallVectorImpl &reductionSyms) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - mlir::Region ®ion = targetOp.getRegion(); + ClauseProcessor bcp(converter, semaCtx, beginClauses); + bcp.processCollapse(loc, eval, clauseOps, iv); + bcp.processOrdered(clauseOps); + bcp.processReduction(loc, clauseOps, &reductionTypes, &reductionSyms); + bcp.processSchedule(stmtCtx, clauseOps); + clauseOps.loopInclusiveAttr = firOpBuilder.getUnitAttr(); + // TODO Support delayed privatization. - auto *regionBlock = - firOpBuilder.createBlock(®ion, {}, mapSymTypes, mapSymLocs); + if (ReductionProcessor::doReductionByRef(clauseOps.reductionVars)) + clauseOps.reductionByRefAttr = firOpBuilder.getUnitAttr(); - // Clones the `bounds` placing them inside the target region and returns them. 
- auto cloneBound = [&](mlir::Value bound) { - if (mlir::isMemoryEffectFree(bound.getDefiningOp())) { - mlir::Operation *clonedOp = bound.getDefiningOp()->clone(); - regionBlock->push_back(clonedOp); - return clonedOp->getResult(0); + if (endClauses) { + ClauseProcessor ecp(converter, semaCtx, *endClauses); + ecp.processNowait(clauseOps); + } + + bcp.processTODO( + loc, llvm::omp::Directive::OMPD_do); +} + +//===----------------------------------------------------------------------===// +// Code generation functions for leaf constructs +//===----------------------------------------------------------------------===// + +static mlir::omp::BarrierOp +genBarrierOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc) { + return converter.getFirOpBuilder().create(loc); +} + +static mlir::omp::CriticalOp +genCriticalOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList, + const std::optional &name) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + mlir::FlatSymbolRefAttr nameAttr; + + if (name) { + std::string nameStr = name->ToString(); + mlir::ModuleOp mod = firOpBuilder.getModule(); + auto global = mod.lookupSymbol(nameStr); + if (!global) { + mlir::omp::CriticalClauseOps clauseOps; + genCriticalDeclareClauses(converter, semaCtx, clauseList, loc, clauseOps, + nameStr); + + mlir::OpBuilder modBuilder(mod.getBodyRegion()); + global = modBuilder.create(loc, clauseOps); } - TODO(converter.getCurrentLocation(), - "target map clause operand unsupported bound type"); - }; + nameAttr = mlir::FlatSymbolRefAttr::get(firOpBuilder.getContext(), + global.getSymName()); + } - auto cloneBounds = [cloneBound](llvm::ArrayRef bounds) { - llvm::SmallVector clonedBounds; - for (mlir::Value bound : bounds) - clonedBounds.emplace_back(cloneBound(bound)); - return clonedBounds; + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval).setGenNested(genNested), + nameAttr); +} + +static mlir::omp::DistributeOp +genDistributeOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + TODO(loc, "Distribute construct"); + return nullptr; +} + +static mlir::omp::FlushOp +genFlushOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const std::optional &objectList, + const std::optional> + &clauseList) { + llvm::SmallVector operandRange; + genFlushClauses(converter, semaCtx, objectList, clauseList, loc, + operandRange); + + return converter.getFirOpBuilder().create( + converter.getCurrentLocation(), operandRange); +} + +static mlir::omp::MasterOp +genMasterOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc) { + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval).setGenNested(genNested)); +} + +static mlir::omp::OrderedOp +genOrderedOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const 
Fortran::parser::OmpClauseList &clauseList) { + TODO(loc, "OMPD_ordered"); + return nullptr; +} + +static mlir::omp::OrderedRegionOp +genOrderedRegionOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + mlir::omp::OrderedRegionClauseOps clauseOps; + genOrderedRegionClauses(converter, semaCtx, clauseList, loc, clauseOps); + + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval).setGenNested(genNested), + clauseOps); +} + +static mlir::omp::ParallelOp +genParallelOp(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList, + bool outerCombined = false) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + Fortran::lower::StatementContext stmtCtx; + mlir::omp::ParallelClauseOps clauseOps; + llvm::SmallVector privateSyms; + llvm::SmallVector reductionTypes; + llvm::SmallVector reductionSyms; + genParallelClauses(converter, semaCtx, stmtCtx, clauseList, loc, + /*processReduction=*/!outerCombined, clauseOps, + reductionTypes, reductionSyms); + + auto reductionCallback = [&](mlir::Operation *op) { + genReductionVars(op, converter, loc, reductionSyms, reductionTypes); + return reductionSyms; }; - // Bind the symbols to their corresponding block arguments. - for (auto [argIndex, argSymbol] : llvm::enumerate(mapSyms)) { - const mlir::BlockArgument &arg = region.getArgument(argIndex); - // Avoid capture of a reference to a structured binding. - const Fortran::semantics::Symbol *sym = argSymbol; - // Structure component symbols don't have bindings. - if (sym->owner().IsDerivedType()) - continue; - fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*sym); - extVal.match( - [&](const fir::BoxValue &v) { - converter.bindSymbol(*sym, - fir::BoxValue(arg, cloneBounds(v.getLBounds()), - v.getExplicitParameters(), - v.getExplicitExtents())); - }, - [&](const fir::MutableBoxValue &v) { - converter.bindSymbol( - *sym, fir::MutableBoxValue(arg, cloneBounds(v.getLBounds()), - v.getMutableProperties())); - }, - [&](const fir::ArrayBoxValue &v) { - converter.bindSymbol( - *sym, fir::ArrayBoxValue(arg, cloneBounds(v.getExtents()), - cloneBounds(v.getLBounds()), - v.getSourceBox())); - }, - [&](const fir::CharArrayBoxValue &v) { - converter.bindSymbol( - *sym, fir::CharArrayBoxValue(arg, cloneBound(v.getLen()), - cloneBounds(v.getExtents()), - cloneBounds(v.getLBounds()))); - }, - [&](const fir::CharBoxValue &v) { - converter.bindSymbol(*sym, - fir::CharBoxValue(arg, cloneBound(v.getLen()))); - }, - [&](const fir::UnboxedValue &v) { converter.bindSymbol(*sym, arg); }, - [&](const auto &) { - TODO(converter.getCurrentLocation(), - "target map clause operand unsupported type"); - }); - } + OpWithBodyGenInfo genInfo = + OpWithBodyGenInfo(converter, semaCtx, loc, eval) + .setGenNested(genNested) + .setOuterCombined(outerCombined) + .setClauses(&clauseList) + .setReductions(&reductionSyms, &reductionTypes) + .setGenRegionEntryCb(reductionCallback); - // Check if cloning the bounds introduced any dependency on the outer region. 
- // If so, then either clone them as well if they are MemoryEffectFree, or else - // copy them to a new temporary and add them to the map and block_argument - // lists and replace their uses with the new temporary. - llvm::SetVector valuesDefinedAbove; - mlir::getUsedValuesDefinedAbove(region, valuesDefinedAbove); - while (!valuesDefinedAbove.empty()) { - for (mlir::Value val : valuesDefinedAbove) { - mlir::Operation *valOp = val.getDefiningOp(); - if (mlir::isMemoryEffectFree(valOp)) { - mlir::Operation *clonedOp = valOp->clone(); - regionBlock->push_front(clonedOp); - val.replaceUsesWithIf( - clonedOp->getResult(0), [regionBlock](mlir::OpOperand &use) { - return use.getOwner()->getBlock() == regionBlock; - }); - } else { - auto savedIP = firOpBuilder.getInsertionPoint(); - firOpBuilder.setInsertionPointAfter(valOp); - auto copyVal = - firOpBuilder.createTemporary(val.getLoc(), val.getType()); - firOpBuilder.createStoreWithConvert(copyVal.getLoc(), val, copyVal); + if (!enableDelayedPrivatization) + return genOpWithBody(genInfo, clauseOps); - llvm::SmallVector bounds; - std::stringstream name; - firOpBuilder.setInsertionPoint(targetOp); - mlir::Value mapOp = createMapInfoOp( - firOpBuilder, copyVal.getLoc(), copyVal, mlir::Value{}, name.str(), - bounds, llvm::SmallVector{}, - static_cast< - std::underlying_type_t>( - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT), - mlir::omp::VariableCaptureKind::ByCopy, copyVal.getType()); - targetOp.getMapOperandsMutable().append(mapOp); - mlir::Value clonedValArg = - region.addArgument(copyVal.getType(), copyVal.getLoc()); - firOpBuilder.setInsertionPointToStart(regionBlock); - auto loadOp = firOpBuilder.create(clonedValArg.getLoc(), - clonedValArg); - val.replaceUsesWithIf( - loadOp->getResult(0), [regionBlock](mlir::OpOperand &use) { - return use.getOwner()->getBlock() == regionBlock; - }); - firOpBuilder.setInsertionPoint(regionBlock, savedIP); - } + bool privatize = !outerCombined; + DataSharingProcessor dsp(converter, semaCtx, clauseList, eval, + /*useDelayedPrivatization=*/true, &symTable); + + if (privatize) + dsp.processStep1(&clauseOps, &privateSyms); + + auto genRegionEntryCB = [&](mlir::Operation *op) { + auto parallelOp = llvm::cast(op); + + llvm::SmallVector reductionLocs( + clauseOps.reductionVars.size(), loc); + + mlir::OperandRange privateVars = parallelOp.getPrivateVars(); + mlir::Region ®ion = parallelOp.getRegion(); + + llvm::SmallVector privateVarTypes = reductionTypes; + privateVarTypes.reserve(privateVarTypes.size() + privateVars.size()); + llvm::transform(privateVars, std::back_inserter(privateVarTypes), + [](mlir::Value v) { return v.getType(); }); + + llvm::SmallVector privateVarLocs = reductionLocs; + privateVarLocs.reserve(privateVarLocs.size() + privateVars.size()); + llvm::transform(privateVars, std::back_inserter(privateVarLocs), + [](mlir::Value v) { return v.getLoc(); }); + + firOpBuilder.createBlock(®ion, /*insertPt=*/{}, privateVarTypes, + privateVarLocs); + + llvm::SmallVector allSymbols = + reductionSyms; + allSymbols.append(privateSyms); + for (auto [arg, prv] : llvm::zip_equal(allSymbols, region.getArguments())) { + converter.bindSymbol(*arg, prv); } - valuesDefinedAbove.clear(); - mlir::getUsedValuesDefinedAbove(region, valuesDefinedAbove); - } - // Insert dummy instruction to remember the insertion position. The - // marker will be deleted since there are not uses. - // In the HLFIR flow there are hlfir.declares inserted above while - // setting block arguments. 
- mlir::Value undefMarker = firOpBuilder.create( - targetOp.getOperation()->getLoc(), firOpBuilder.getIndexType()); + return allSymbols; + }; - // Create blocks for unstructured regions. This has to be done since - // blocks are initially allocated with the function as the parent region. - if (eval.lowerAsUnstructured()) { - Fortran::lower::createEmptyRegionBlocks( - firOpBuilder, eval.getNestedEvaluations()); - } + // TODO Merge with the reduction CB. + genInfo.setGenRegionEntryCb(genRegionEntryCB).setDataSharingProcessor(&dsp); + return genOpWithBody(genInfo, clauseOps); +} - firOpBuilder.create(currentLocation); +static mlir::omp::SectionOp +genSectionOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + // Currently only private/firstprivate clause is handled, and + // all privatization is done within `omp.section` operations. + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval) + .setGenNested(genNested) + .setClauses(&clauseList)); +} - // Create the insertion point after the marker. - firOpBuilder.setInsertionPointAfter(undefMarker.getDefiningOp()); - if (genNested) - genNestedEvaluations(converter, eval); +static mlir::omp::SectionsOp +genSectionsOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const mlir::omp::SectionsClauseOps &clauseOps) { + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval).setGenNested(false), + clauseOps); +} + +static mlir::omp::SimdLoopOp +genSimdLoopOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + DataSharingProcessor dsp(converter, semaCtx, clauseList, eval); + dsp.processStep1(); + + Fortran::lower::StatementContext stmtCtx; + mlir::omp::SimdLoopClauseOps clauseOps; + llvm::SmallVector iv; + genSimdLoopClauses(converter, semaCtx, stmtCtx, eval, clauseList, loc, + clauseOps, iv); + + auto *nestedEval = + getCollapsedLoopEval(eval, Fortran::lower::getCollapseValue(clauseList)); + + auto ivCallback = [&](mlir::Operation *op) { + return genLoopVars(op, converter, loc, iv); + }; + + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval) + .setClauses(&clauseList) + .setDataSharingProcessor(&dsp) + .setGenRegionEntryCb(ivCallback), + clauseOps); +} + +static mlir::omp::SingleOp +genSingleOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &beginClauseList, + const Fortran::parser::OmpClauseList &endClauseList) { + mlir::omp::SingleClauseOps clauseOps; + genSingleClauses(converter, semaCtx, beginClauseList, endClauseList, loc, + clauseOps); + + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval) + .setGenNested(genNested) + .setClauses(&beginClauseList), + clauseOps); } static mlir::omp::TargetOp genTargetOp(Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, + mlir::Location loc, const Fortran::parser::OmpClauseList &clauseList, - 
llvm::omp::Directive directive, bool outerCombined = false) { + bool outerCombined = false) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); Fortran::lower::StatementContext stmtCtx; + + bool processHostOnlyClauses = + !llvm::cast(*converter.getModuleOp()) + .getIsTargetDevice(); + mlir::omp::TargetClauseOps clauseOps; - llvm::SmallVector mapTypes, devicePtrTypes, deviceAddrTypes; - llvm::SmallVector mapLocs, devicePtrLocs, deviceAddrLocs; llvm::SmallVector mapSyms, devicePtrSyms, deviceAddrSyms; - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(llvm::omp::Directive::OMPD_target, clauseOps); - cp.processDevice(stmtCtx, clauseOps); - cp.processThreadLimit(stmtCtx, clauseOps); - cp.processDepend(clauseOps); - cp.processNowait(clauseOps); - cp.processMap(currentLocation, directive, stmtCtx, clauseOps, &mapSyms, - &mapLocs, &mapTypes); - cp.processIsDevicePtr(clauseOps, devicePtrTypes, devicePtrLocs, - devicePtrSyms); - cp.processHasDeviceAddr(clauseOps, deviceAddrTypes, deviceAddrLocs, - deviceAddrSyms); - // TODO Support delayed privatization. - - cp.processTODO(currentLocation, - llvm::omp::Directive::OMPD_target); + llvm::SmallVector mapLocs, devicePtrLocs, deviceAddrLocs; + llvm::SmallVector mapTypes, devicePtrTypes, deviceAddrTypes; + genTargetClauses(converter, semaCtx, stmtCtx, clauseList, loc, + processHostOnlyClauses, /*processReduction=*/outerCombined, + clauseOps, mapSyms, mapLocs, mapTypes, deviceAddrSyms, + deviceAddrLocs, deviceAddrTypes, devicePtrSyms, + devicePtrLocs, devicePtrTypes); // 5.8.1 Implicit Data-Mapping Attribute Rules // The following code follows the implicit data-mapping rules to map all the @@ -1278,22 +1515,21 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, fir::ExtendedValue dataExv = converter.getSymbolExtendedValue(sym); name << sym.name().ToString(); - Fortran::lower::AddrAndBoundsInfo info = - getDataOperandBaseAddr(converter, converter.getFirOpBuilder(), sym, - converter.getCurrentLocation()); + Fortran::lower::AddrAndBoundsInfo info = getDataOperandBaseAddr( + converter, firOpBuilder, sym, converter.getCurrentLocation()); if (fir::unwrapRefType(info.addr.getType()).isa()) bounds = Fortran::lower::genBoundsOpsFromBox( - converter.getFirOpBuilder(), converter.getCurrentLocation(), - converter, dataExv, info); + firOpBuilder, converter.getCurrentLocation(), converter, + dataExv, info); if (fir::unwrapRefType(info.addr.getType()).isa()) { bool dataExvIsAssumedSize = Fortran::semantics::IsAssumedSizeArray(sym.GetUltimate()); bounds = Fortran::lower::genBaseBoundsOps( - converter.getFirOpBuilder(), converter.getCurrentLocation(), - converter, dataExv, dataExvIsAssumedSize); + firOpBuilder, converter.getCurrentLocation(), converter, dataExv, + dataExvIsAssumedSize); } llvm::omp::OpenMPOffloadMappingFlags mapFlag = @@ -1307,7 +1543,7 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, // If a variable is specified in declare target link and if device // type is not specified as `nohost`, it needs to be mapped tofrom - mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); + mlir::ModuleOp mod = firOpBuilder.getModule(); mlir::Operation *op = mod.lookupSymbol(converter.mangleName(sym)); auto declareTargetOp = llvm::dyn_cast_if_present(op); @@ -1327,8 +1563,8 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, } mlir::Value mapOp = createMapInfoOp( - converter.getFirOpBuilder(), baseOp.getLoc(), baseOp, mlir::Value{}, - name.str(), bounds, {}, + firOpBuilder, baseOp.getLoc(), baseOp, 
mlir::Value{}, name.str(), + bounds, {}, static_cast< std::underlying_type_t>( mapFlag), @@ -1341,340 +1577,146 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, } } }; - Fortran::lower::pft::visitAllSymbols(eval, captureImplicitMap); - - auto targetOp = converter.getFirOpBuilder().create( - currentLocation, clauseOps); - - genBodyOfTargetOp(converter, semaCtx, eval, genNested, targetOp, mapSyms, - mapLocs, mapTypes, currentLocation); - - return targetOp; -} - -static mlir::omp::TeamsOp -genTeamsOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList, - bool outerCombined = false) { - Fortran::lower::StatementContext stmtCtx; - mlir::omp::TeamsClauseOps clauseOps; - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(llvm::omp::Directive::OMPD_teams, clauseOps); - cp.processAllocate(clauseOps); - cp.processDefault(); - cp.processNumTeams(stmtCtx, clauseOps); - cp.processThreadLimit(stmtCtx, clauseOps); - // TODO Support delayed privatization. - - cp.processTODO(currentLocation, - llvm::omp::Directive::OMPD_teams); - - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setOuterCombined(outerCombined) - .setClauses(&clauseList), - clauseOps); -} - -/// Extract the list of function and variable symbols affected by the given -/// 'declare target' directive and return the intended device type for them. -static void getDeclareTargetInfo( - Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - const Fortran::parser::OpenMPDeclareTargetConstruct &declareTargetConstruct, - mlir::omp::DeclareTargetClauseOps &clauseOps, - llvm::SmallVectorImpl &symbolAndClause) { - const auto &spec = std::get( - declareTargetConstruct.t); - if (const auto *objectList{ - Fortran::parser::Unwrap(spec.u)}) { - ObjectList objects{makeObjects(*objectList, semaCtx)}; - // Case: declare target(func, var1, var2) - gatherFuncAndVarSyms(objects, mlir::omp::DeclareTargetCaptureClause::to, - symbolAndClause); - } else if (const auto *clauseList{ - Fortran::parser::Unwrap( - spec.u)}) { - if (clauseList->v.empty()) { - // Case: declare target, implicit capture of function - symbolAndClause.emplace_back( - mlir::omp::DeclareTargetCaptureClause::to, - eval.getOwningProcedure()->getSubprogramSymbol()); - } - - ClauseProcessor cp(converter, semaCtx, *clauseList); - cp.processTo(symbolAndClause); - cp.processEnter(symbolAndClause); - cp.processLink(symbolAndClause); - cp.processDeviceType(clauseOps); - cp.processTODO(converter.getCurrentLocation(), - llvm::omp::Directive::OMPD_declare_target); - } -} - -static void collectDeferredDeclareTargets( - Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - const Fortran::parser::OpenMPDeclareTargetConstruct &declareTargetConstruct, - llvm::SmallVectorImpl - &deferredDeclareTarget) { - mlir::omp::DeclareTargetClauseOps clauseOps; - llvm::SmallVector symbolAndClause; - getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct, - clauseOps, symbolAndClause); - // Return the device type only if at least one of the targets for the - // directive is a function or subroutine - mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); - - for 
(const DeclareTargetCapturePair &symClause : symbolAndClause) { - mlir::Operation *op = mod.lookupSymbol(converter.mangleName( - std::get(symClause))); - - if (!op) { - deferredDeclareTarget.push_back({std::get<0>(symClause), - clauseOps.deviceType, - std::get<1>(symClause)}); - } - } -} - -static std::optional -getDeclareTargetFunctionDevice( - Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - const Fortran::parser::OpenMPDeclareTargetConstruct - &declareTargetConstruct) { - mlir::omp::DeclareTargetClauseOps clauseOps; - llvm::SmallVector symbolAndClause; - getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct, - clauseOps, symbolAndClause); - - // Return the device type only if at least one of the targets for the - // directive is a function or subroutine - mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); - for (const DeclareTargetCapturePair &symClause : symbolAndClause) { - mlir::Operation *op = mod.lookupSymbol(converter.mangleName( - std::get(symClause))); - - if (mlir::isa_and_nonnull(op)) - return clauseOps.deviceType; - } - - return std::nullopt; -} - -//===----------------------------------------------------------------------===// -// genOMP() Code generation helper functions -//===----------------------------------------------------------------------===// - -static void -genOmpSimpleStandalone(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - const Fortran::parser::OpenMPSimpleStandaloneConstruct - &simpleStandaloneConstruct) { - const auto &directive = - std::get( - simpleStandaloneConstruct.t); - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - const auto &opClauseList = - std::get(simpleStandaloneConstruct.t); - mlir::Location currentLocation = converter.genLocation(directive.source); - - switch (directive.v) { - default: - break; - case llvm::omp::Directive::OMPD_barrier: - firOpBuilder.create(currentLocation); - break; - case llvm::omp::Directive::OMPD_taskwait: { - mlir::omp::TaskwaitClauseOps clauseOps; - ClauseProcessor cp(converter, semaCtx, opClauseList); - cp.processTODO( - currentLocation, llvm::omp::Directive::OMPD_taskwait); - firOpBuilder.create(currentLocation, clauseOps); - break; - } - case llvm::omp::Directive::OMPD_taskyield: - firOpBuilder.create(currentLocation); - break; - case llvm::omp::Directive::OMPD_target_data: - genTargetDataOp(converter, semaCtx, eval, genNested, currentLocation, - opClauseList); - break; - case llvm::omp::Directive::OMPD_target_enter_data: - genTargetEnterExitDataUpdateOp( - converter, semaCtx, currentLocation, opClauseList); - break; - case llvm::omp::Directive::OMPD_target_exit_data: - genTargetEnterExitDataUpdateOp( - converter, semaCtx, currentLocation, opClauseList); - break; - case llvm::omp::Directive::OMPD_target_update: - genTargetEnterExitDataUpdateOp( - converter, semaCtx, currentLocation, opClauseList); - break; - case llvm::omp::Directive::OMPD_ordered: - TODO(currentLocation, "OMPD_ordered"); - } -} - -static void -genOmpFlush(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - const Fortran::parser::OpenMPFlushConstruct &flushConstruct) { - llvm::SmallVector operandRange; - if (const auto &ompObjectList = - std::get>( - flushConstruct.t)) - genObjectList2(*ompObjectList, converter, operandRange); - 
const auto &memOrderClause = - std::get>>( - flushConstruct.t); - if (memOrderClause && memOrderClause->size() > 0) - TODO(converter.getCurrentLocation(), "Handle OmpMemoryOrderClause"); - converter.getFirOpBuilder().create( - converter.getCurrentLocation(), operandRange); -} - -static llvm::SmallVector -genLoopVars(mlir::Operation *op, Fortran::lower::AbstractConverter &converter, - mlir::Location &loc, - llvm::ArrayRef args) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - auto ®ion = op->getRegion(0); - - std::size_t loopVarTypeSize = 0; - for (const Fortran::semantics::Symbol *arg : args) - loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size()); - mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize); - llvm::SmallVector tiv(args.size(), loopVarType); - llvm::SmallVector locs(args.size(), loc); - firOpBuilder.createBlock(®ion, {}, tiv, locs); - // The argument is not currently in memory, so make a temporary for the - // argument, and store it there, then bind that location to the argument. - mlir::Operation *storeOp = nullptr; - for (auto [argIndex, argSymbol] : llvm::enumerate(args)) { - mlir::Value indexVal = fir::getBase(region.front().getArgument(argIndex)); - storeOp = - createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol); - } - firOpBuilder.setInsertionPointAfter(storeOp); - - return llvm::SmallVector(args); -} - -static llvm::SmallVector -genLoopAndReductionVars( - mlir::Operation *op, Fortran::lower::AbstractConverter &converter, - mlir::Location &loc, - llvm::ArrayRef loopArgs, - llvm::ArrayRef reductionArgs, - llvm::ArrayRef reductionTypes) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - - llvm::SmallVector blockArgTypes; - llvm::SmallVector blockArgLocs; - blockArgTypes.reserve(loopArgs.size() + reductionArgs.size()); - blockArgLocs.reserve(blockArgTypes.size()); - mlir::Block *entryBlock; - - if (loopArgs.size()) { - std::size_t loopVarTypeSize = 0; - for (const Fortran::semantics::Symbol *arg : loopArgs) - loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size()); - mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize); - std::fill_n(std::back_inserter(blockArgTypes), loopArgs.size(), - loopVarType); - std::fill_n(std::back_inserter(blockArgLocs), loopArgs.size(), loc); - } - if (reductionArgs.size()) { - llvm::copy(reductionTypes, std::back_inserter(blockArgTypes)); - std::fill_n(std::back_inserter(blockArgLocs), reductionArgs.size(), loc); - } - entryBlock = firOpBuilder.createBlock(&op->getRegion(0), {}, blockArgTypes, - blockArgLocs); - // The argument is not currently in memory, so make a temporary for the - // argument, and store it there, then bind that location to the argument. 
- if (loopArgs.size()) { - mlir::Operation *storeOp = nullptr; - for (auto [argIndex, argSymbol] : llvm::enumerate(loopArgs)) { - mlir::Value indexVal = - fir::getBase(op->getRegion(0).front().getArgument(argIndex)); - storeOp = - createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol); - } - firOpBuilder.setInsertionPointAfter(storeOp); - } - // Bind the reduction arguments to their block arguments - for (auto [arg, prv] : llvm::zip_equal( - reductionArgs, - llvm::drop_begin(entryBlock->getArguments(), loopArgs.size()))) { - converter.bindSymbol(*arg, prv); - } + Fortran::lower::pft::visitAllSymbols(eval, captureImplicitMap); - return llvm::SmallVector(loopArgs); + auto targetOp = firOpBuilder.create(loc, clauseOps); + genBodyOfTargetOp(converter, semaCtx, eval, genNested, targetOp, mapSyms, + mapLocs, mapTypes, loc); + return targetOp; } -static void -createSimdLoop(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - llvm::omp::Directive ompDirective, - const Fortran::parser::OmpClauseList &loopOpClauseList, - mlir::Location loc) { +static mlir::omp::TargetDataOp +genTargetDataOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + Fortran::lower::StatementContext stmtCtx; + mlir::omp::TargetDataClauseOps clauseOps; + llvm::SmallVector useDeviceTypes; + llvm::SmallVector useDeviceLocs; + llvm::SmallVector useDeviceSyms; + genTargetDataClauses(converter, semaCtx, stmtCtx, clauseList, loc, clauseOps, + useDeviceTypes, useDeviceLocs, useDeviceSyms); + + auto targetDataOp = + converter.getFirOpBuilder().create(loc, + clauseOps); + genBodyOfTargetDataOp(converter, semaCtx, eval, genNested, targetDataOp, + useDeviceTypes, useDeviceLocs, useDeviceSyms, loc); + return targetDataOp; +} + +template +static OpTy genTargetEnterExitUpdateDataOp( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - DataSharingProcessor dsp(converter, semaCtx, loopOpClauseList, eval); - dsp.processStep1(); + Fortran::lower::StatementContext stmtCtx; + + // GCC 9.3.0 emits a (probably) bogus warning about an unused variable. 
+ [[maybe_unused]] llvm::omp::Directive directive; + if constexpr (std::is_same_v) { + directive = llvm::omp::Directive::OMPD_target_enter_data; + } else if constexpr (std::is_same_v) { + directive = llvm::omp::Directive::OMPD_target_exit_data; + } else if constexpr (std::is_same_v) { + directive = llvm::omp::Directive::OMPD_target_update; + } else { + llvm_unreachable("Unexpected TARGET DATA construct"); + } + + mlir::omp::TargetEnterExitUpdateDataClauseOps clauseOps; + genTargetEnterExitUpdateDataClauses(converter, semaCtx, stmtCtx, clauseList, + loc, directive, clauseOps); + + return firOpBuilder.create(loc, clauseOps); +} +static mlir::omp::TaskOp +genTaskOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { Fortran::lower::StatementContext stmtCtx; - mlir::omp::SimdLoopClauseOps clauseOps; - llvm::SmallVector iv; + mlir::omp::TaskClauseOps clauseOps; + genTaskClauses(converter, semaCtx, stmtCtx, clauseList, loc, clauseOps); - ClauseProcessor cp(converter, semaCtx, loopOpClauseList); - cp.processCollapse(loc, eval, clauseOps, iv); - cp.processReduction(loc, clauseOps); - cp.processIf(llvm::omp::Directive::OMPD_simd, clauseOps); - cp.processSimdlen(clauseOps); - cp.processSafelen(clauseOps); - clauseOps.loopInclusiveAttr = firOpBuilder.getUnitAttr(); - // TODO Support delayed privatization. + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval) + .setGenNested(genNested) + .setClauses(&clauseList), + clauseOps); +} - cp.processTODO(loc, ompDirective); +static mlir::omp::TaskgroupOp +genTaskgroupOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + mlir::omp::TaskgroupClauseOps clauseOps; + genTaskgroupClauses(converter, semaCtx, clauseList, loc, clauseOps); - auto *nestedEval = getCollapsedLoopEval( - eval, Fortran::lower::getCollapseValue(loopOpClauseList)); + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval) + .setGenNested(genNested) + .setClauses(&clauseList), + clauseOps); +} - auto ivCallback = [&](mlir::Operation *op) { - return genLoopVars(op, converter, loc, iv); - }; +static mlir::omp::TaskloopOp +genTaskloopOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + TODO(loc, "Taskloop construct"); +} - genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval) - .setClauses(&loopOpClauseList) - .setDataSharingProcessor(&dsp) - .setGenRegionEntryCb(ivCallback), +static mlir::omp::TaskwaitOp +genTaskwaitOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + mlir::omp::TaskwaitClauseOps clauseOps; + genTaskwaitClauses(converter, semaCtx, clauseList, loc, clauseOps); + return converter.getFirOpBuilder().create(loc, + clauseOps); +} + +static mlir::omp::TaskyieldOp +genTaskyieldOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc) { + return 
converter.getFirOpBuilder().create(loc); +} + +static mlir::omp::TeamsOp +genTeamsOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, const Fortran::parser::OmpClauseList &clauseList, + bool outerCombined = false) { + Fortran::lower::StatementContext stmtCtx; + mlir::omp::TeamsClauseOps clauseOps; + genTeamsClauses(converter, semaCtx, stmtCtx, clauseList, loc, clauseOps); + + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval) + .setGenNested(genNested) + .setOuterCombined(outerCombined) + .setClauses(&clauseList), clauseOps); } -static void createWsloop(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - llvm::omp::Directive ompDirective, - const Fortran::parser::OmpClauseList &beginClauseList, - const Fortran::parser::OmpClauseList *endClauseList, - mlir::Location loc) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); +static mlir::omp::WsloopOp +genWsloopOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const Fortran::parser::OmpClauseList &beginClauseList, + const Fortran::parser::OmpClauseList *endClauseList) { DataSharingProcessor dsp(converter, semaCtx, beginClauseList, eval); dsp.processStep1(); @@ -1683,30 +1725,9 @@ static void createWsloop(Fortran::lower::AbstractConverter &converter, llvm::SmallVector iv; llvm::SmallVector reductionTypes; llvm::SmallVector reductionSyms; - - ClauseProcessor cp(converter, semaCtx, beginClauseList); - cp.processCollapse(loc, eval, clauseOps, iv); - cp.processSchedule(stmtCtx, clauseOps); - cp.processReduction(loc, clauseOps, &reductionTypes, &reductionSyms); - cp.processOrdered(clauseOps); - clauseOps.loopInclusiveAttr = firOpBuilder.getUnitAttr(); - // TODO Support delayed privatization. - - if (ReductionProcessor::doReductionByRef(clauseOps.reductionVars)) - clauseOps.reductionByRefAttr = firOpBuilder.getUnitAttr(); - - cp.processTODO(loc, - ompDirective); - - // In FORTRAN `nowait` clause occur at the end of `omp do` directive. 
- // i.e - // !$omp do - // <...> - // !$omp end do nowait - if (endClauseList) { - ClauseProcessor ecp(converter, semaCtx, *endClauseList); - ecp.processNowait(clauseOps); - } + genWsloopClauses(converter, semaCtx, stmtCtx, eval, beginClauseList, + endClauseList, loc, clauseOps, iv, reductionTypes, + reductionSyms); auto *nestedEval = getCollapsedLoopEval( eval, Fortran::lower::getCollapseValue(beginClauseList)); @@ -1716,7 +1737,7 @@ static void createWsloop(Fortran::lower::AbstractConverter &converter, reductionTypes); }; - genOpWithBody( + return genOpWithBody( OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval) .setClauses(&beginClauseList) .setDataSharingProcessor(&dsp) @@ -1725,7 +1746,11 @@ static void createWsloop(Fortran::lower::AbstractConverter &converter, clauseOps); } -static void createSimdWsloop( +//===----------------------------------------------------------------------===// +// Code generation functions for composite constructs +//===----------------------------------------------------------------------===// + +static void genCompositeDoSimd( Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, llvm::omp::Directive ompDirective, @@ -1733,7 +1758,7 @@ static void createSimdWsloop( const Fortran::parser::OmpClauseList *endClauseList, mlir::Location loc) { ClauseProcessor cp(converter, semaCtx, beginClauseList); cp.processTODO(loc, + clause::Order, clause::Safelen, clause::Simdlen>(loc, ompDirective); // TODO: Add support for vectorization - add vectorization hints inside loop // body. @@ -1743,34 +1768,7 @@ static void createSimdWsloop( // When support for vectorization is enabled, then we need to add handling of // if clause. Currently if clause can be skipped because we always assume // SIMD length = 1. - createWsloop(converter, semaCtx, eval, ompDirective, beginClauseList, - endClauseList, loc); -} - -static void -markDeclareTarget(mlir::Operation *op, - Fortran::lower::AbstractConverter &converter, - mlir::omp::DeclareTargetCaptureClause captureClause, - mlir::omp::DeclareTargetDeviceType deviceType) { - // TODO: Add support for program local variables with declare target applied - auto declareTargetOp = llvm::dyn_cast(op); - if (!declareTargetOp) - fir::emitFatalError( - converter.getCurrentLocation(), - "Attempt to apply declare target on unsupported operation"); - - // The function or global already has a declare target applied to it, very - // likely through implicit capture (usage in another declare target - // function/subroutine). 
It should be marked as any if it has been assigned - // both host and nohost, else we skip, as there is no change - if (declareTargetOp.isDeclareTarget()) { - if (declareTargetOp.getDeclareTargetDeviceType() != deviceType) - declareTargetOp.setDeclareTarget(mlir::omp::DeclareTargetDeviceType::any, - captureClause); - return; - } - - declareTargetOp.setDeclareTarget(deviceType, captureClause); + genWsloopOp(converter, semaCtx, eval, loc, beginClauseList, endClauseList); } //===----------------------------------------------------------------------===// @@ -1865,6 +1863,102 @@ genOMP(Fortran::lower::AbstractConverter &converter, ompDeclConstruct.u); } +//===----------------------------------------------------------------------===// +// OpenMPStandaloneConstruct visitors +//===----------------------------------------------------------------------===// + +static void genOMP(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPSimpleStandaloneConstruct + &simpleStandaloneConstruct) { + const auto &directive = + std::get( + simpleStandaloneConstruct.t); + const auto &clauseList = + std::get(simpleStandaloneConstruct.t); + mlir::Location currentLocation = converter.genLocation(directive.source); + + switch (directive.v) { + default: + break; + case llvm::omp::Directive::OMPD_barrier: + genBarrierOp(converter, semaCtx, eval, currentLocation); + break; + case llvm::omp::Directive::OMPD_taskwait: + genTaskwaitOp(converter, semaCtx, eval, currentLocation, clauseList); + break; + case llvm::omp::Directive::OMPD_taskyield: + genTaskyieldOp(converter, semaCtx, eval, currentLocation); + break; + case llvm::omp::Directive::OMPD_target_data: + genTargetDataOp(converter, semaCtx, eval, /*genNested=*/true, + currentLocation, clauseList); + break; + case llvm::omp::Directive::OMPD_target_enter_data: + genTargetEnterExitUpdateDataOp( + converter, semaCtx, currentLocation, clauseList); + break; + case llvm::omp::Directive::OMPD_target_exit_data: + genTargetEnterExitUpdateDataOp( + converter, semaCtx, currentLocation, clauseList); + break; + case llvm::omp::Directive::OMPD_target_update: + genTargetEnterExitUpdateDataOp( + converter, semaCtx, currentLocation, clauseList); + break; + case llvm::omp::Directive::OMPD_ordered: + genOrderedOp(converter, semaCtx, eval, currentLocation, clauseList); + break; + } +} + +static void +genOMP(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPFlushConstruct &flushConstruct) { + const auto &verbatim = std::get(flushConstruct.t); + const auto &objectList = + std::get>(flushConstruct.t); + const auto &clauseList = + std::get>>( + flushConstruct.t); + mlir::Location currentLocation = converter.genLocation(verbatim.source); + genFlushOp(converter, semaCtx, eval, currentLocation, objectList, clauseList); +} + +static void +genOMP(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPCancelConstruct &cancelConstruct) { + TODO(converter.getCurrentLocation(), "OpenMPCancelConstruct"); +} + +static void genOMP(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + 
Fortran::lower::pft::Evaluation &eval,
+                   const Fortran::parser::OpenMPCancellationPointConstruct
+                       &cancellationPointConstruct) {
+  TODO(converter.getCurrentLocation(), "OpenMPCancellationPointConstruct");
+}
+
+static void
+genOMP(Fortran::lower::AbstractConverter &converter,
+       Fortran::lower::SymMap &symTable,
+       Fortran::semantics::SemanticsContext &semaCtx,
+       Fortran::lower::pft::Evaluation &eval,
+       const Fortran::parser::OpenMPStandaloneConstruct &standaloneConstruct) {
+  std::visit(
+      [&](auto &&s) { return genOMP(converter, symTable, semaCtx, eval, s); },
+      standaloneConstruct.u);
+}
+
 //===----------------------------------------------------------------------===//
 // OpenMPConstruct visitors
 //===----------------------------------------------------------------------===//
@@ -1996,7 +2090,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
     break;
   case llvm::omp::Directive::OMPD_target:
     genTargetOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation,
-                beginClauseList, directive.v);
+                beginClauseList);
     break;
   case llvm::omp::Directive::OMPD_target_data:
     genTargetDataOp(converter, semaCtx, eval, /*genNested=*/true,
@@ -2012,8 +2106,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
     break;
   case llvm::omp::Directive::OMPD_teams:
     genTeamsOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation,
-               beginClauseList,
-               /*outerCombined=*/false);
+               beginClauseList);
     break;
   case llvm::omp::Directive::OMPD_workshare:
     // FIXME: Workshare is not a commonly used OpenMP construct, an
@@ -2035,8 +2128,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
   if ((llvm::omp::allTargetSet & llvm::omp::blockConstructSet)
           .test(directive.v)) {
     genTargetOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation,
-                beginClauseList, directive.v,
-                /*outerCombined=*/true);
+                beginClauseList, /*outerCombined=*/true);
     combinedDirective = true;
   }
   if ((llvm::omp::allTeamsSet & llvm::omp::blockConstructSet)
@@ -2073,44 +2165,13 @@ genOMP(Fortran::lower::AbstractConverter &converter,
        Fortran::semantics::SemanticsContext &semaCtx,
        Fortran::lower::pft::Evaluation &eval,
        const Fortran::parser::OpenMPCriticalConstruct &criticalConstruct) {
-  fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-  mlir::Location currentLocation = converter.getCurrentLocation();
-  std::string name;
-  const Fortran::parser::OmpCriticalDirective &cd =
+  const auto &cd =
       std::get(criticalConstruct.t);
-  if (std::get>(cd.t).has_value()) {
-    name =
-        std::get>(cd.t).value().ToString();
-  }
-
-  mlir::omp::CriticalOp criticalOp = [&]() {
-    if (name.empty()) {
-      return firOpBuilder.create(
-          currentLocation, mlir::FlatSymbolRefAttr());
-    }
-
-    mlir::ModuleOp module = firOpBuilder.getModule();
-    mlir::OpBuilder modBuilder(module.getBodyRegion());
-    auto global = module.lookupSymbol(name);
-    if (!global) {
-      mlir::omp::CriticalClauseOps clauseOps;
-      const auto &clauseList = std::get(cd.t);
-
-      ClauseProcessor cp(converter, semaCtx, clauseList);
-      cp.processHint(clauseOps);
-      clauseOps.nameAttr =
-          mlir::StringAttr::get(firOpBuilder.getContext(), name);
-
-      global = modBuilder.create(currentLocation,
-                                 clauseOps);
-    }
-
-    return firOpBuilder.create(
-        currentLocation, mlir::FlatSymbolRefAttr::get(firOpBuilder.getContext(),
-                                                      global.getSymName()));
-  }();
-  auto genInfo = OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval);
-  createBodyOfOp(criticalOp, genInfo);
+  const auto &clauseList = std::get(cd.t);
+  const auto &name = std::get>(cd.t);
+  mlir::Location currentLocation =
converter.getCurrentLocation(); + genCriticalOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation, + clauseList, name); } static void @@ -2129,7 +2190,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, const Fortran::parser::OpenMPLoopConstruct &loopConstruct) { const auto &beginLoopDirective = std::get(loopConstruct.t); - const auto &loopOpClauseList = + const auto &beginClauseList = std::get(beginLoopDirective.t); mlir::Location currentLocation = converter.genLocation(beginLoopDirective.source); @@ -2150,33 +2211,31 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, bool validDirective = false; if (llvm::omp::topTaskloopSet.test(ompDirective)) { validDirective = true; - TODO(currentLocation, "Taskloop construct"); + genTaskloopOp(converter, semaCtx, eval, currentLocation, beginClauseList); } else { // Create omp.{target, teams, distribute, parallel} nested operations if ((llvm::omp::allTargetSet & llvm::omp::loopConstructSet) .test(ompDirective)) { validDirective = true; genTargetOp(converter, semaCtx, eval, /*genNested=*/false, - currentLocation, loopOpClauseList, ompDirective, - /*outerCombined=*/true); + currentLocation, beginClauseList, /*outerCombined=*/true); } if ((llvm::omp::allTeamsSet & llvm::omp::loopConstructSet) .test(ompDirective)) { validDirective = true; genTeamsOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation, - loopOpClauseList, - /*outerCombined=*/true); + beginClauseList, /*outerCombined=*/true); } if (llvm::omp::allDistributeSet.test(ompDirective)) { validDirective = true; - TODO(currentLocation, "Distribute construct"); + genDistributeOp(converter, semaCtx, eval, /*genNested=*/false, + currentLocation, beginClauseList); } if ((llvm::omp::allParallelSet & llvm::omp::loopConstructSet) .test(ompDirective)) { validDirective = true; genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/false, - currentLocation, loopOpClauseList, - /*outerCombined=*/true); + currentLocation, beginClauseList, /*outerCombined=*/true); } } if ((llvm::omp::allDoSet | llvm::omp::allSimdSet).test(ompDirective)) @@ -2190,17 +2249,14 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, if (llvm::omp::allDoSimdSet.test(ompDirective)) { // 2.9.3.2 Workshare SIMD construct - createSimdWsloop(converter, semaCtx, eval, ompDirective, loopOpClauseList, - endClauseList, currentLocation); - + genCompositeDoSimd(converter, semaCtx, eval, ompDirective, beginClauseList, + endClauseList, currentLocation); } else if (llvm::omp::allSimdSet.test(ompDirective)) { // 2.9.3.1 SIMD construct - createSimdLoop(converter, semaCtx, eval, ompDirective, loopOpClauseList, - currentLocation); - genOpenMPReduction(converter, semaCtx, loopOpClauseList); + genSimdLoopOp(converter, semaCtx, eval, currentLocation, beginClauseList); } else { - createWsloop(converter, semaCtx, eval, ompDirective, loopOpClauseList, - endClauseList, currentLocation); + genWsloopOp(converter, semaCtx, eval, currentLocation, beginClauseList, + endClauseList); } } @@ -2220,44 +2276,39 @@ genOMP(Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, const Fortran::parser::OpenMPSectionsConstruct §ionsConstruct) { - mlir::Location currentLocation = converter.getCurrentLocation(); - mlir::omp::SectionsClauseOps clauseOps; const auto &beginSectionsDirective = std::get(sectionsConstruct.t); - const auto §ionsClauseList = + const auto &beginClauseList = std::get(beginSectionsDirective.t); // 
Process clauses before optional omp.parallel, so that new variables are // allocated outside of the parallel region - ClauseProcessor cp(converter, semaCtx, sectionsClauseList); - cp.processSectionsReduction(currentLocation, clauseOps); - cp.processAllocate(clauseOps); - // TODO Support delayed privatization. + mlir::Location currentLocation = converter.getCurrentLocation(); + mlir::omp::SectionsClauseOps clauseOps; + genSectionsClauses(converter, semaCtx, beginClauseList, currentLocation, + /*clausesFromBeginSections=*/true, clauseOps); + // Parallel wrapper of PARALLEL SECTIONS construct llvm::omp::Directive dir = std::get(beginSectionsDirective.t) .v; - - // Parallel wrapper of PARALLEL SECTIONS construct if (dir == llvm::omp::Directive::OMPD_parallel_sections) { genParallelOp(converter, symTable, semaCtx, eval, - /*genNested=*/false, currentLocation, sectionsClauseList, + /*genNested=*/false, currentLocation, beginClauseList, /*outerCombined=*/true); } else { const auto &endSectionsDirective = std::get(sectionsConstruct.t); - const auto &endSectionsClauseList = + const auto &endClauseList = std::get(endSectionsDirective.t); - ClauseProcessor(converter, semaCtx, endSectionsClauseList) - .processNowait(clauseOps); + genSectionsClauses(converter, semaCtx, endClauseList, currentLocation, + /*clausesFromBeginSections=*/false, clauseOps); } - // SECTIONS construct - genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(false), - clauseOps); + // SECTIONS construct. + genSectionsOp(converter, semaCtx, eval, currentLocation, clauseOps); + // Generate nested SECTION operations recursively. const auto §ionBlocks = std::get(sectionsConstruct.t); auto &firOpBuilder = converter.getFirOpBuilder(); @@ -2266,40 +2317,12 @@ genOMP(Fortran::lower::AbstractConverter &converter, llvm::zip(sectionBlocks.v, eval.getNestedEvaluations())) { symTable.pushScope(); genSectionOp(converter, semaCtx, neval, /*genNested=*/true, currentLocation, - sectionsClauseList); + beginClauseList); symTable.popScope(); firOpBuilder.restoreInsertionPoint(ip); } } -static void -genOMP(Fortran::lower::AbstractConverter &converter, - Fortran::lower::SymMap &symTable, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - const Fortran::parser::OpenMPStandaloneConstruct &standaloneConstruct) { - std::visit( - Fortran::common::visitors{ - [&](const Fortran::parser::OpenMPSimpleStandaloneConstruct - &simpleStandaloneConstruct) { - genOmpSimpleStandalone(converter, semaCtx, eval, - /*genNested=*/true, - simpleStandaloneConstruct); - }, - [&](const Fortran::parser::OpenMPFlushConstruct &flushConstruct) { - genOmpFlush(converter, semaCtx, eval, flushConstruct); - }, - [&](const Fortran::parser::OpenMPCancelConstruct &cancelConstruct) { - TODO(converter.getCurrentLocation(), "OpenMPCancelConstruct"); - }, - [&](const Fortran::parser::OpenMPCancellationPointConstruct - &cancellationPointConstruct) { - TODO(converter.getCurrentLocation(), "OpenMPCancelConstruct"); - }, - }, - standaloneConstruct.u); -} - static void genOMP(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 8ab74103cb6a80..88710880174d21 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -3993,6 +3993,25 @@ mlir::LogicalResult fir::CUDAKernelOp::verify() { return mlir::success(); 
}

+mlir::LogicalResult fir::CUDAAllocateOp::verify() {
+  if (getPinned() && getStream())
+    return emitOpError("pinned and stream cannot appear at the same time");
+  if (!fir::unwrapRefType(getBox().getType()).isa())
+    return emitOpError(
+        "expect box to be a reference to/or a class or box type value");
+  if (getSource() &&
+      !fir::unwrapRefType(getSource().getType()).isa())
+    return emitOpError(
+        "expect source to be a reference to/or a class or box type value");
+  if (getErrmsg() &&
+      !fir::unwrapRefType(getErrmsg().getType()).isa())
+    return emitOpError(
+        "expect errmsg to be a reference to/or a box type value");
+  if (getErrmsg() && !getHasStat())
+    return emitOpError("expect stat attribute when errmsg is provided");
+  return mlir::success();
+}
+
 //===----------------------------------------------------------------------===//
 // FIROpsDialect
 //===----------------------------------------------------------------------===//
diff --git a/flang/runtime/extensions.cpp b/flang/runtime/extensions.cpp
index 3ac98000335d7d..12498b502ae1cf 100644
--- a/flang/runtime/extensions.cpp
+++ b/flang/runtime/extensions.cpp
@@ -17,6 +17,7 @@
 #include "flang/Runtime/entry-names.h"
 #include "flang/Runtime/io-api.h"
 #include
+#include
 #include
 #include
 #include
@@ -138,5 +139,77 @@ void RTNAME(Sleep)(std::int64_t seconds) {
   std::this_thread::sleep_for(std::chrono::seconds(seconds));
 }
 
+// TODO: not supported on Windows
+#ifndef _WIN32
+std::int64_t FORTRAN_PROCEDURE_NAME(access)(const char *name,
+    std::int64_t nameLength, const char *mode, std::int64_t modeLength) {
+  std::int64_t ret{-1};
+  if (nameLength <= 0 || modeLength <= 0 || !name || !mode) {
+    return ret;
+  }
+
+  // ensure name is null terminated
+  char *newName{nullptr};
+  if (name[nameLength - 1] != '\0') {
+    newName = static_cast(std::malloc(nameLength + 1));
+    std::memcpy(newName, name, nameLength);
+    newName[nameLength] = '\0';
+    name = newName;
+  }
+
+  // calculate mode
+  bool read{false};
+  bool write{false};
+  bool execute{false};
+  bool exists{false};
+  int imode{0};
+
+  for (std::int64_t i = 0; i < modeLength; ++i) {
+    switch (mode[i]) {
+    case 'r':
+      read = true;
+      break;
+    case 'w':
+      write = true;
+      break;
+    case 'x':
+      execute = true;
+      break;
+    case ' ':
+      exists = true;
+      break;
+    default:
+      // invalid mode
+      goto cleanup;
+    }
+  }
+  if (!read && !write && !execute && !exists) {
+    // invalid mode
+    goto cleanup;
+  }
+
+  if (!read && !write && !execute) {
+    imode = F_OK;
+  } else {
+    if (read) {
+      imode |= R_OK;
+    }
+    if (write) {
+      imode |= W_OK;
+    }
+    if (execute) {
+      imode |= X_OK;
+    }
+  }
+  ret = access(name, imode);
+
+cleanup:
+  if (newName) {
+    std::free(newName);
+  }
+  return ret;
+}
+#endif
+
 } // namespace Fortran::runtime
 } // extern "C"
diff --git a/flang/test/Fir/cuf-invalid.fir b/flang/test/Fir/cuf-invalid.fir
new file mode 100644
index 00000000000000..9c5ffe7176a3bd
--- /dev/null
+++ b/flang/test/Fir/cuf-invalid.fir
@@ -0,0 +1,50 @@
+// RUN: fir-opt -split-input-file -verify-diagnostics %s
+
+func.func @_QPsub1() {
+  %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
+  %1 = fir.alloca i32
+  %pinned = fir.alloca i1
+  %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>)
+  %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref>
+  %s = fir.load %1 : !fir.ref
+  // expected-error@+1{{'fir.cuda_allocate' op pinned and stream cannot appear at the same time}}
+  %13 = fir.cuda_allocate %11 : !fir.ref> stream(%s :
i32) pinned(%pinned : !fir.ref) {cuda_attr = #fir.cuda} -> i32 + return +} + +// ----- + +func.func @_QPsub1() { + %1 = fir.alloca i32 + // expected-error@+1{{'fir.cuda_allocate' op expect box to be a reference to/or a class or box type value}} + %2 = fir.cuda_allocate %1 : !fir.ref {cuda_attr = #fir.cuda} -> i32 + return +} + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %c100 = arith.constant 100 : index + %7 = fir.alloca !fir.char<1,100> {bindc_name = "msg", uniq_name = "_QFsub1Emsg"} + %8:2 = hlfir.declare %7 typeparams %c100 {uniq_name = "_QFsub1Emsg"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + %9 = fir.embox %8#1 : (!fir.ref>) -> !fir.box> + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %16 = fir.convert %9 : (!fir.box>) -> !fir.box + // expected-error@+1{{'fir.cuda_allocate' op expect stat attribute when errmsg is provided}} + %13 = fir.cuda_allocate %11 : !fir.ref> errmsg(%16 : !fir.box) {cuda_attr = #fir.cuda} -> i32 + return +} + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %1 = fir.alloca i32 + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + // expected-error@+1{{'fir.cuda_allocate' op expect errmsg to be a reference to/or a box type value}} + %13 = fir.cuda_allocate %11 : !fir.ref> errmsg(%1 : !fir.ref) {cuda_attr = #fir.cuda, hasStat} -> i32 + return +} diff --git a/flang/test/Fir/cuf.mlir b/flang/test/Fir/cuf.mlir new file mode 100644 index 00000000000000..67eff31b35b2b8 --- /dev/null +++ b/flang/test/Fir/cuf.mlir @@ -0,0 +1,70 @@ +// RUN: fir-opt --split-input-file %s | fir-opt --split-input-file | FileCheck %s + +// Simple round trip test of operations. 
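+// Each case below declares a device allocatable via hlfir.declare, narrows the
+// descriptor reference with fir.convert, and then applies fir.cuda_allocate
+// with one optional clause (stream, source, pinned, errmsg); the CHECK line
+// after each case verifies that the op prints back unchanged after parsing.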
+ +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %13 = fir.cuda_allocate %11 : !fir.ref> {cuda_attr = #fir.cuda} -> i32 + return +} + +// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref> {cuda_attr = #fir.cuda} -> i32 + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %1 = fir.alloca i32 + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %s = fir.load %1 : !fir.ref + %13 = fir.cuda_allocate %11 : !fir.ref> stream(%s : i32) {cuda_attr = #fir.cuda} -> i32 + return +} + +// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref> stream(%{{.*}} : i32) {cuda_attr = #fir.cuda} -> i32 + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %1 = fir.alloca !fir.box>> {bindc_name = "b", uniq_name = "_QFsub1Eb"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %5:2 = hlfir.declare %1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %12 = fir.convert %5#1 : (!fir.ref>>>) -> !fir.ref> + %13 = fir.cuda_allocate %11 : !fir.ref> source(%12 : !fir.ref>) {cuda_attr = #fir.cuda} -> i32 + return +} + +// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref> source(%{{.*}} : !fir.ref>) {cuda_attr = #fir.cuda} -> i32 + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %pinned = fir.alloca i1 + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %13 = fir.cuda_allocate %11 : !fir.ref> pinned(%pinned : !fir.ref) {cuda_attr = #fir.cuda} -> i32 + return +} + +// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref> pinned(%{{.*}} : !fir.ref) {cuda_attr = #fir.cuda} -> i32 + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %c100 = arith.constant 100 : index + %7 = fir.alloca !fir.char<1,100> {bindc_name = "msg", uniq_name = "_QFsub1Emsg"} + %8:2 = hlfir.declare %7 typeparams %c100 {uniq_name = "_QFsub1Emsg"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + %9 = fir.embox %8#1 : (!fir.ref>) -> !fir.box> + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %16 = fir.convert %9 : (!fir.box>) -> !fir.box + %13 = fir.cuda_allocate %11 : !fir.ref> errmsg(%16 : !fir.box) {cuda_attr = #fir.cuda, hasStat} -> i32 + return +} diff --git a/flang/test/Lower/OpenMP/FIR/target.f90 b/flang/test/Lower/OpenMP/FIR/target.f90 index 022327f9c25daf..ca3162340d7846 100644 --- a/flang/test/Lower/OpenMP/FIR/target.f90 +++ b/flang/test/Lower/OpenMP/FIR/target.f90 @@ -411,8 +411,8 @@ end subroutine omp_target_implicit_bounds !CHECK-LABEL: func.func @_QPomp_target_thread_limit() { subroutine 
omp_target_thread_limit integer :: a - !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32 !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "a"} + !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32 !CHECK: omp.target thread_limit(%[[VAL_1]] : i32) map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !fir.ref) { !CHECK: ^bb0(%[[ARG_0]]: !fir.ref): !$omp target map(tofrom: a) thread_limit(64) diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 6f72b5a34d069a..51b66327dfb24b 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -490,8 +490,8 @@ end subroutine omp_target_implicit_bounds !CHECK-LABEL: func.func @_QPomp_target_thread_limit() { subroutine omp_target_thread_limit integer :: a - !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32 !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "a"} + !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32 !CHECK: omp.target thread_limit(%[[VAL_1]] : i32) map_entries(%[[MAP]] -> %{{.*}} : !fir.ref) { !CHECK: ^bb0(%{{.*}}: !fir.ref): !$omp target map(tofrom: a) thread_limit(64) diff --git a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 index 33b5971656010a..d849dd206b9439 100644 --- a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 +++ b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 @@ -21,7 +21,7 @@ subroutine only_use_device_ptr !CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr() !CHECK: omp.target_data use_device_ptr({{.*}} : !fir.ref>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref>>>, !fir.ref>>>) { -!CHECK: ^bb0(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>>>): +!CHECK: ^bb0(%{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>): subroutine mix_use_device_ptr_and_addr use iso_c_binding integer, pointer, dimension(:) :: array @@ -47,7 +47,7 @@ subroutine only_use_device_addr !CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr_and_map() !CHECK: omp.target_data map_entries(%{{.*}}, %{{.*}} : !fir.ref, !fir.ref) use_device_ptr(%{{.*}} : !fir.ref>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref>>>, !fir.ref>>>) { -!CHECK: ^bb0(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>>>): +!CHECK: ^bb0(%{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>): subroutine mix_use_device_ptr_and_addr_and_map use iso_c_binding integer :: i, j diff --git a/flang/unittests/Runtime/AccessTest.cpp b/flang/unittests/Runtime/AccessTest.cpp new file mode 100644 index 00000000000000..66f19f78c7cfb6 --- /dev/null +++ b/flang/unittests/Runtime/AccessTest.cpp @@ -0,0 +1,422 @@ +//===-- flang/unittests/Runtime/AccessTest.cpp ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// TODO: ACCESS is not yet implemented on Windows
+#ifndef _WIN32
+
+#include "CrashHandlerFixture.h"
+#include "gtest/gtest.h"
+#include "flang/Runtime/extensions.h"
+#include "llvm/ADT/Twine.h"
+
+#include
+#include
+#include
+#include
+
+namespace {
+
+struct AccessTests : public CrashHandlerFixture {};
+
+struct AccessType {
+  bool read{false};
+  bool write{false};
+  bool execute{false};
+  bool exists{false};
+};
+
+} // namespace
+
+static std::string addPIDSuffix(const char *name) {
+  std::stringstream ss;
+  ss << name;
+  ss << '.';
+
+  ss << getpid();
+
+  return ss.str();
+}
+
+static bool exists(const std::string &path) {
+  return access(path.c_str(), F_OK) == 0;
+}
+
+// Implementation of std::filesystem::temp_directory_path adapted from libcxx
+// See llvm-project/libcxx/src/filesystem/operations.cpp
+// Using std::filesystem is inconvenient because the required flags are not
+// consistent across compilers and CMake doesn't have built-in support to
+// determine the correct flags.
+static const char *temp_directory_path() {
+  // TODO: Windows
+  const char *env_paths[] = {"TMPDIR", "TMP", "TEMP", "TEMPDIR"};
+  const char *ret = nullptr;
+
+  for (auto &ep : env_paths) {
+    if ((ret = getenv(ep))) {
+      break;
+    }
+  }
+
+  if (ret == nullptr) {
+#if defined(__ANDROID__)
+    ret = "/data/local/tmp";
+#else
+    ret = "/tmp";
+#endif
+  }
+
+  assert(exists(ret));
+  return ret;
+}
+
+static std::string createTemporaryFile(
+    const char *name, const AccessType &accessType) {
+  std::string path =
+      (llvm::Twine{temp_directory_path()} + "/" + addPIDSuffix(name)).str();
+
+  // O_CREAT | O_EXCL enforces that this file is newly created by this call.
+  // This feels risky. If we don't have permission to create files in the
+  // temporary directory or if the files already exist, the test will fail.
+  // But we can't use std::tmpfile() because we need a path to the file and
+  // to control the filesystem permissions.
+  mode_t mode{0};
+  if (accessType.read) {
+    mode |= S_IRUSR;
+  }
+  if (accessType.write) {
+    mode |= S_IWUSR;
+  }
+  if (accessType.execute) {
+    mode |= S_IXUSR;
+  }
+
+  int file = open(path.c_str(), O_CREAT | O_EXCL, mode);
+  if (file == -1) {
+    return {};
+  }
+
+  close(file);
+
+  return path;
+}
+
+static std::int64_t callAccess(
+    const std::string &path, const AccessType &accessType) {
+  const char *cpath{path.c_str()};
+  std::int64_t pathlen = std::strlen(cpath);
+
+  std::string mode;
+  if (accessType.read) {
+    mode += 'r';
+  }
+  if (accessType.write) {
+    mode += 'w';
+  }
+  if (accessType.execute) {
+    mode += 'x';
+  }
+  if (accessType.exists) {
+    mode += ' ';
+  }
+
+  const char *cmode = mode.c_str();
+  std::int64_t modelen = std::strlen(cmode);
+
+  return FORTRAN_PROCEDURE_NAME(access)(cpath, pathlen, cmode, modelen);
+}
+
+TEST(AccessTests, TestExists) {
+  AccessType accessType;
+  accessType.exists = true;
+
+  std::string path = createTemporaryFile(__func__, accessType);
+  ASSERT_FALSE(path.empty());
+
+  std::int64_t res = callAccess(path, accessType);
+
+  ASSERT_EQ(unlink(path.c_str()), 0);
+
+  ASSERT_EQ(res, 0);
+}
+
+TEST(AccessTests, TestNotExists) {
+  std::string nonExistent{addPIDSuffix(__func__)};
+  ASSERT_FALSE(exists(nonExistent));
+
+  AccessType accessType;
+  accessType.exists = true;
+  std::int64_t res = callAccess(nonExistent, accessType);
+
+  ASSERT_NE(res, 0);
+}
+
+TEST(AccessTests, TestRead) {
+  AccessType accessType;
+  accessType.read = true;
+
+  std::string path = createTemporaryFile(__func__, accessType);
+  ASSERT_FALSE(path.empty());
+
+  std::int64_t res = callAccess(path, accessType);
+
+  ASSERT_EQ(unlink(path.c_str()), 0);
+
+  ASSERT_EQ(res, 0);
+}
+
+TEST(AccessTests, TestNotRead) {
+  AccessType accessType;
+  accessType.read = false;
+
+  std::string path = createTemporaryFile(__func__, accessType);
+  ASSERT_FALSE(path.empty());
+
+  accessType.read = true;
+  std::int64_t res = callAccess(path, accessType);
+
+  ASSERT_EQ(unlink(path.c_str()), 0);
+
+  ASSERT_NE(res, 0);
+}
+
+TEST(AccessTests, TestWrite) {
+  AccessType accessType;
+  accessType.write = true;
+
+  std::string path = createTemporaryFile(__func__, accessType);
+  ASSERT_FALSE(path.empty());
+
+  std::int64_t res = callAccess(path, accessType);
+
+  ASSERT_EQ(unlink(path.c_str()), 0);
+
+  ASSERT_EQ(res, 0);
+}
+
+TEST(AccessTests, TestNotWrite) {
+  AccessType accessType;
+  accessType.write = false;
+
+  std::string path = createTemporaryFile(__func__, accessType);
+  ASSERT_FALSE(path.empty());
+
+  accessType.write = true;
+  std::int64_t res = callAccess(path, accessType);
+
+  ASSERT_EQ(unlink(path.c_str()), 0);
+
+  ASSERT_NE(res, 0);
+}
+
+TEST(AccessTests, TestReadWrite) {
+  AccessType accessType;
+  accessType.read = true;
+  accessType.write = true;
+
+  std::string path = createTemporaryFile(__func__, accessType);
+  ASSERT_FALSE(path.empty());
+
+  std::int64_t res = callAccess(path, accessType);
+
+  ASSERT_EQ(unlink(path.c_str()), 0);
+
+  ASSERT_EQ(res, 0);
+}
+
+TEST(AccessTests, TestNotReadWrite0) {
+  AccessType accessType;
+  accessType.read = false;
+  accessType.write = false;
+
+  std::string path = createTemporaryFile(__func__, accessType);
+  ASSERT_FALSE(path.empty());
+
+  accessType.read = true;
+  accessType.write = true;
+  std::int64_t res = callAccess(path, accessType);
+
+  ASSERT_EQ(unlink(path.c_str()), 0);
+
+  ASSERT_NE(res, 0);
+}
+
+TEST(AccessTests, TestNotReadWrite1) {
+ 
AccessType accessType; + accessType.read = true; + accessType.write = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotReadWrite2) { + AccessType accessType; + accessType.read = false; + accessType.write = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestExecute) { + AccessType accessType; + accessType.execute = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_EQ(res, 0); +} + +TEST(AccessTests, TestNotExecute) { + AccessType accessType; + accessType.execute = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestRWX) { + AccessType accessType; + accessType.read = true; + accessType.write = true; + accessType.execute = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_EQ(res, 0); +} + +TEST(AccessTests, TestNotRWX0) { + AccessType accessType; + accessType.read = false; + accessType.write = false; + accessType.execute = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotRWX1) { + AccessType accessType; + accessType.read = true; + accessType.write = false; + accessType.execute = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotRWX2) { + AccessType accessType; + accessType.read = true; + accessType.write = true; + accessType.execute = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotRWX3) { + AccessType accessType; + accessType.read = true; + accessType.write = false; + accessType.execute = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotRWX4) { + AccessType 
accessType; + accessType.read = false; + accessType.write = true; + accessType.execute = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +#endif // !_WIN32 diff --git a/flang/unittests/Runtime/CMakeLists.txt b/flang/unittests/Runtime/CMakeLists.txt index 23f02aa751246b..f7caacad3a598f 100644 --- a/flang/unittests/Runtime/CMakeLists.txt +++ b/flang/unittests/Runtime/CMakeLists.txt @@ -1,4 +1,5 @@ add_flang_unittest(FlangRuntimeTests + AccessTest.cpp Allocatable.cpp ArrayConstructor.cpp BufferTest.cpp diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h index be5e54f3fa5c85..3755d288047e0b 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h @@ -34,7 +34,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template _LIBCPP_HIDE_FROM_ABI optional __parallel_or(_Index __first, _Index __last, _Brick __f) { std::atomic __found(false); - auto __ret = __pstl::__cpu_traits<_Backend>::__parallel_for(__first, __last, [__f, &__found](_Index __i, _Index __j) { + auto __ret = __pstl::__cpu_traits<_Backend>::__for_each(__first, __last, [__f, &__found](_Index __i, _Index __j) { if (!__found.load(std::memory_order_relaxed) && __f(__i, __j)) { __found.store(true, std::memory_order_relaxed); __pstl::__cpu_traits<_Backend>::__cancel_execution(); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h index 49a32f6c5ce551..0c20bdff62675a 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h @@ -40,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __pstl_fill(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { - return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( + return __pstl::__cpu_traits<__cpu_backend_tag>::__for_each( __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { [[maybe_unused]] auto __res = std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>( __cpu_backend_tag{}, __brick_first, __brick_last, __value); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h index 11a5668bf25af1..626293faef6921 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h @@ -42,7 +42,7 @@ __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool _DifferenceType __initial_dist = __b_first ? 
__n : -1; std::atomic<_DifferenceType> __extremum(__initial_dist); // TODO: find out what is better here: parallel_for or parallel_reduce - auto __res = __pstl::__cpu_traits<_Backend>::__parallel_for( + auto __res = __pstl::__cpu_traits<_Backend>::__for_each( __first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) { // See "Reducing Contention Through Priority Updates", PPoPP '13, for discussion of // why using a shared variable scales fairly well in this situation. diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h index 1667ec0f0c4f41..d637084e151d81 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h @@ -40,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __pstl_for_each(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { - return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( + return __pstl::__cpu_traits<__cpu_backend_tag>::__for_each( __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { [[maybe_unused]] auto __res = std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>( __cpu_backend_tag{}, __brick_first, __brick_last, __func); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h index 8757f249680375..17faadf55dd4fa 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h @@ -85,7 +85,7 @@ template <> struct __cpu_traits<__libdispatch_backend_tag> { template _LIBCPP_HIDE_FROM_ABI static optional<__empty> - __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) { + __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) { return __libdispatch::__dispatch_parallel_for( __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func)); } @@ -105,14 +105,14 @@ struct __cpu_traits<__libdispatch_backend_tag> { typename _RandomAccessIterator3, typename _Compare, typename _LeafMerge> - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge( - _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __result, - _Compare __comp, - _LeafMerge __leaf_merge) noexcept { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __merge(_RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, + _RandomAccessIterator2 __first2, + _RandomAccessIterator2 __last2, + _RandomAccessIterator3 __result, + _Compare __comp, + _LeafMerge __leaf_merge) noexcept { __libdispatch::__chunk_partitions __partitions = __libdispatch::__partition_chunks(std::max(__last1 - __first1, __last2 - __first2)); @@ -201,7 +201,7 @@ struct __cpu_traits<__libdispatch_backend_tag> { } template - _LIBCPP_HIDE_FROM_ABI static optional<_Value> __parallel_transform_reduce( + _LIBCPP_HIDE_FROM_ABI static optional<_Value> __transform_reduce( _RandomAccessIterator __first, _RandomAccessIterator __last, _Transform __transform, @@ -248,8 +248,8 @@ struct 
__cpu_traits<__libdispatch_backend_tag> { } template - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort( - _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) { const auto __size = __last - __first; auto __partitions = __libdispatch::__partition_chunks(__size); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h index d034447904872e..c93f4051c9d094 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h @@ -46,7 +46,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_merge( __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { - auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_merge( + auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__merge( __first1, __last1, __first2, diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h index c3d2905daed170..7544619a8eefd8 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h @@ -35,20 +35,20 @@ template <> struct __cpu_traits<__serial_backend_tag> { template _LIBCPP_HIDE_FROM_ABI static optional<__empty> - __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { + __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { __f(__first, __last); return __empty{}; } template _LIBCPP_HIDE_FROM_ABI static optional<_Tp> - __parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { + __transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { return __reduce(std::move(__first), std::move(__last), std::move(__init)); } template - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort( - _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { __leaf_sort(__first, __last, __comp); return __empty{}; } @@ -60,14 +60,14 @@ struct __cpu_traits<__serial_backend_tag> { class _RandomAccessIterator3, class _Compare, class _LeafMerge> - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge( - _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __outit, - _Compare __comp, - _LeafMerge __leaf_merge) { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __merge(_RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, + _RandomAccessIterator2 __first2, + _RandomAccessIterator2 __last2, + _RandomAccessIterator3 __outit, + _Compare __comp, + _LeafMerge __leaf_merge) { __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp); return __empty{}; } diff --git 
a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h index ebfa0fc69147d5..8c60cf897ff860 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h @@ -29,7 +29,7 @@ template _LIBCPP_HIDE_FROM_ABI optional<__empty> __pstl_stable_sort(__cpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy>) { - return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_stable_sort( + return __pstl::__cpu_traits<__cpu_backend_tag>::__stable_sort( __first, __last, __comp, [](_RandomAccessIterator __g_first, _RandomAccessIterator __g_last, _Comp __g_comp) { std::stable_sort(__g_first, __g_last, __g_comp); }); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h index 8d1cb221c3d82a..2acf912264a001 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h @@ -38,20 +38,20 @@ template <> struct __cpu_traits<__std_thread_backend_tag> { template _LIBCPP_HIDE_FROM_ABI static optional<__empty> - __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { + __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { __f(__first, __last); return __empty{}; } template _LIBCPP_HIDE_FROM_ABI static optional<_Tp> - __parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { + __transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { return __reduce(std::move(__first), std::move(__last), std::move(__init)); } template - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort( - _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { __leaf_sort(__first, __last, __comp); return __empty{}; } @@ -63,14 +63,14 @@ struct __cpu_traits<__std_thread_backend_tag> { class _RandomAccessIterator3, class _Compare, class _LeafMerge> - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge( - _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __outit, - _Compare __comp, - _LeafMerge __leaf_merge) { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __merge(_RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, + _RandomAccessIterator2 __first2, + _RandomAccessIterator2 __last2, + _RandomAccessIterator3 __outit, + _Compare __comp, + _LeafMerge __leaf_merge) { __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp); return __empty{}; } diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h index d4c383997a67a9..4b9b2968668327 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h @@ -50,7 +50,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform( if constexpr 
(__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { - __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( + __pstl::__cpu_traits<__cpu_backend_tag>::__for_each( __first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { auto __res = std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>( __cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op); @@ -98,7 +98,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform( __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { - auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( + auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__for_each( __first1, __last1, [__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) { diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h index 956c7d6a88ce29..c074eea9861c1b 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h @@ -120,7 +120,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce( if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) { - return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_transform_reduce( + return __pstl::__cpu_traits<__cpu_backend_tag>::__transform_reduce( __first1, std::move(__last1), [__first1, __first2, __transform](_ForwardIterator1 __iter) { @@ -167,7 +167,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce( _UnaryOperation __transform) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { - return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_transform_reduce( + return __pstl::__cpu_traits<__cpu_backend_tag>::__transform_reduce( std::move(__first), std::move(__last), [__transform](_ForwardIterator __iter) { return __transform(*__iter); }, diff --git a/libcxx/include/__pstl/cpu_algos/cpu_traits.h b/libcxx/include/__pstl/cpu_algos/cpu_traits.h index 2f0db46e9be83a..0483d6918fd01d 100644 --- a/libcxx/include/__pstl/cpu_algos/cpu_traits.h +++ b/libcxx/include/__pstl/cpu_algos/cpu_traits.h @@ -32,31 +32,30 @@ namespace __pstl { // ================ // // template -// optional<__empty> __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func); +// optional<__empty> __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func); // - __func must take a subrange of [__first, __last) that should be executed in serial // // template -// optional<_Tp> __parallel_transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, -// _Reduction); +// optional<_Tp> __transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduction); // // template -// 
optional<_RandomAccessIterator3> __parallel_merge(_RandomAccessIterator1 __first1,
-//                                               _RandomAccessIterator1 __last1,
-//                                               _RandomAccessIterator2 __first2,
-//                                               _RandomAccessIterator2 __last2,
-//                                               _RandomAccessIterator3 __outit,
-//                                               _Compare __comp,
-//                                               _LeafMerge __leaf_merge);
+// optional<_RandomAccessIterator3> __merge(_RandomAccessIterator1 __first1,
+//                                          _RandomAccessIterator1 __last1,
+//                                          _RandomAccessIterator2 __first2,
+//                                          _RandomAccessIterator2 __last2,
+//                                          _RandomAccessIterator3 __outit,
+//                                          _Compare __comp,
+//                                          _LeafMerge __leaf_merge);
 //
 // template
-// optional<__empty> __parallel_stable_sort(_RandomAccessIterator __first,
-//                                          _RandomAccessIterator __last,
-//                                          _Comp __comp,
-//                                          _LeafSort __leaf_sort);
+// optional<__empty> __stable_sort(_RandomAccessIterator __first,
+//                                 _RandomAccessIterator __last,
+//                                 _Comp __comp,
+//                                 _LeafSort __leaf_sort);
 //
 // void __cancel_execution();
 //     Cancel the execution of other jobs - they aren't needed anymore. This is not a binding request,
diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 9c20bbb83d86d1..7269d156752de8 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -2072,8 +2072,16 @@ void Writer::createRuntimePseudoRelocs() {
     return;
   }
 
-  if (!rels.empty())
+  if (!rels.empty()) {
     log("Writing " + Twine(rels.size()) + " runtime pseudo relocations");
+    const char *symbolName = "_pei386_runtime_relocator";
+    Symbol *relocator = ctx.symtab.findUnderscore(symbolName);
+    if (!relocator)
+      error("output image has runtime pseudo relocations, but the function " +
+            Twine(symbolName) +
+            " is missing; it is needed for fixing the relocations at runtime");
+  }
+
   PseudoRelocTableChunk *table = make(rels);
   rdataSec->addChunk(table);
   EmptyChunk *endOfList = make();
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index c8350652e65a67..fa48552b8f7a12 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -464,7 +464,11 @@ void InputSection::copyRelocations(uint8_t *buf,
       addend += sec->getFile()->mipsGp0;
     }
 
-    if (RelTy::IsRela)
+    if (config->emachine == EM_LOONGARCH && type == R_LARCH_ALIGN)
+      // Per LoongArch psABI v2.30, R_LARCH_ALIGN requires a symbol index.
+      // If it uses the section symbol, the addend should not be changed.
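+      // With a symbol index present, the addend encodes the alignment itself
+      // rather than a distance: in the loongarch-relax-align-ldr.s test below,
+      // `.p2align 4, , 8` yields R_LARCH_ALIGN .text+0x804, where 0x4 is the
+      // log2 of the 16-byte alignment and 0x800 holds the maximum number of
+      // bytes to skip.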
+ p->r_addend = addend; + else if (RelTy::IsRela) p->r_addend = sym.getVA(addend) - section->getOutputSection()->addr; // For SHF_ALLOC sections relocated by REL, append a relocation to // sec->relocations so that relocateAlloc transitively called by diff --git a/lld/test/COFF/autoimport-arm-data.s b/lld/test/COFF/autoimport-arm-data.s index 74604aa5c82343..82c66f0989d490 100644 --- a/lld/test/COFF/autoimport-arm-data.s +++ b/lld/test/COFF/autoimport-arm-data.s @@ -33,6 +33,9 @@ .text .thumb main: + bx lr + .global _pei386_runtime_relocator +_pei386_runtime_relocator: bx lr .data ptr: diff --git a/lld/test/COFF/autoimport-arm64-data.s b/lld/test/COFF/autoimport-arm64-data.s index fa3654be3a71d7..b49bd4f89c97c2 100644 --- a/lld/test/COFF/autoimport-arm64-data.s +++ b/lld/test/COFF/autoimport-arm64-data.s @@ -33,6 +33,9 @@ .global main .text main: + ret + .global _pei386_runtime_relocator +_pei386_runtime_relocator: ret .data ptr: diff --git a/lld/test/COFF/autoimport-gnu-implib.s b/lld/test/COFF/autoimport-gnu-implib.s index d7d4ed626e83ae..d9dc9d7a38fdc3 100644 --- a/lld/test/COFF/autoimport-gnu-implib.s +++ b/lld/test/COFF/autoimport-gnu-implib.s @@ -27,5 +27,8 @@ .text main: movl data(%rip), %eax + ret + .global _pei386_runtime_relocator +_pei386_runtime_relocator: ret .data diff --git a/lld/test/COFF/autoimport-handler-func.s b/lld/test/COFF/autoimport-handler-func.s new file mode 100644 index 00000000000000..02d040bfa274ce --- /dev/null +++ b/lld/test/COFF/autoimport-handler-func.s @@ -0,0 +1,36 @@ +# REQUIRES: x86 +# RUN: split-file %s %t.dir + +# RUN: llvm-dlltool -m i386:x86-64 -d %t.dir/lib.def -D lib.dll -l %t.dir/lib.lib + +# RUN: llvm-mc -triple=x86_64-windows-gnu %t.dir/main.s -filetype=obj -o %t.dir/main.obj +# RUN: llvm-mc -triple=x86_64-windows-gnu %t.dir/func.s -filetype=obj -o %t.dir/func.obj +# RUN: env LLD_IN_TEST=1 not lld-link -lldmingw -out:%t.dir/main.exe -entry:main %t.dir/main.obj %t.dir/lib.lib 2>&1 | FileCheck %s --check-prefix=ERR + +# RUN: lld-link -lldmingw -out:%t.dir/main.exe -entry:main %t.dir/main.obj %t.dir/func.obj %t.dir/lib.lib 2>&1 | FileCheck %s --check-prefix=NOERR --allow-empty + +# ERR: error: output image has runtime pseudo relocations, but the function _pei386_runtime_relocator is missing; it is needed for fixing the relocations at runtime + +# NOERR-NOT: error + +#--- main.s + .global main + .text +main: + ret + + .data + .long 1 + .quad variable + .long 2 + +#--- func.s + .global _pei386_runtime_relocator + .text +_pei386_runtime_relocator: + ret + +#--- lib.def +EXPORTS +variable DATA + diff --git a/lld/test/COFF/autoimport-warn.s b/lld/test/COFF/autoimport-warn.s index 9c363ed30f2459..eead0fed861f8f 100644 --- a/lld/test/COFF/autoimport-warn.s +++ b/lld/test/COFF/autoimport-warn.s @@ -18,6 +18,9 @@ main: movl variable2(%rip), %ecx addl %ecx, %eax ret + .global _pei386_runtime_relocator +_pei386_runtime_relocator: + ret .section .rdata$.refptr.variable1,"dr",discard,.refptr.variable1 .global .refptr.variable1 diff --git a/lld/test/COFF/autoimport-x86.s b/lld/test/COFF/autoimport-x86.s index fa36f10e9ca912..5d7c9c2c3fa580 100644 --- a/lld/test/COFF/autoimport-x86.s +++ b/lld/test/COFF/autoimport-x86.s @@ -55,6 +55,9 @@ .text main: movl variable(%rip), %eax + ret + .global _pei386_runtime_relocator +_pei386_runtime_relocator: ret .data ptr: diff --git a/lld/test/ELF/loongarch-relax-align-ldr.s b/lld/test/ELF/loongarch-relax-align-ldr.s new file mode 100644 index 00000000000000..6534dc906cfd02 --- /dev/null +++ 
b/lld/test/ELF/loongarch-relax-align-ldr.s
@@ -0,0 +1,28 @@
+# REQUIRES: loongarch
+## Test that `ld -r` does not change the addend of R_LARCH_ALIGN.
+
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.64.o
+# RUN: ld.lld -r %t.64.o %t.64.o -o %t.64.r
+# RUN: llvm-objdump -dr --no-show-raw-insn %t.64.r | FileCheck %s
+
+# CHECK:      <.text>:
+# CHECK-NEXT:   break 1
+# CHECK-NEXT:   nop
+# CHECK-NEXT:     {{0*}}04: R_LARCH_ALIGN .text+0x804
+# CHECK-NEXT:   nop
+# CHECK-NEXT:   nop
+# CHECK-NEXT:   break 2
+# CHECK-NEXT:   break 0
+# CHECK-NEXT:   break 0
+# CHECK-NEXT:   break 0
+# CHECK-NEXT:   break 1
+# CHECK-NEXT:   nop
+# CHECK-NEXT:     {{0*}}24: R_LARCH_ALIGN .text+0x804
+# CHECK-NEXT:   nop
+# CHECK-NEXT:   nop
+# CHECK-NEXT:   break 2
+
+.text
+break 1
+.p2align 4, , 8
+break 2
diff --git a/lld/test/ELF/loongarch-relax-emit-relocs.s b/lld/test/ELF/loongarch-relax-emit-relocs.s
index 581fce8c95caa4..9007f8fcc114f0 100644
--- a/lld/test/ELF/loongarch-relax-emit-relocs.s
+++ b/lld/test/ELF/loongarch-relax-emit-relocs.s
@@ -25,7 +25,7 @@
 # CHECK-NEXT: R_LARCH_PCALA_LO12 _start
 # CHECK-NEXT: R_LARCH_RELAX *ABS*
 # CHECK-NEXT: nop
-# CHECK-NEXT: R_LARCH_ALIGN .Lla-relax-align0+0x4
+# CHECK-NEXT: R_LARCH_ALIGN .text+0x4
 # CHECK-NEXT: nop
 # CHECK-NEXT: ret
@@ -37,11 +37,12 @@
 # CHECKR-NEXT: R_LARCH_PCALA_LO12 _start
 # CHECKR-NEXT: R_LARCH_RELAX *ABS*
 # CHECKR-NEXT: nop
-# CHECKR-NEXT: R_LARCH_ALIGN .Lla-relax-align0+0x4
+# CHECKR-NEXT: R_LARCH_ALIGN .text+0x4
 # CHECKR-NEXT: nop
 # CHECKR-NEXT: nop
 # CHECKR-NEXT: ret
 
+.text
 .global _start
 _start:
     la.pcrel $a0, _start
diff --git a/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp b/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp
index 2032c5a68d054c..6bfaa54135a959 100644
--- a/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp
+++ b/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp
@@ -909,6 +909,9 @@ bool x86AssemblyInspectionEngine::GetNonCallSiteUnwindPlanFromAssembly(
   if (!m_register_map_initialized)
     return false;
 
+  if (m_disasm_context == nullptr)
+    return false;
+
   addr_t current_func_text_offset = 0;
   int current_sp_bytes_offset_from_fa = 0;
   bool is_aligned = false;
@@ -1570,6 +1573,9 @@ bool x86AssemblyInspectionEngine::FindFirstNonPrologueInstruction(
   if (!m_register_map_initialized)
     return false;
 
+  if (m_disasm_context == nullptr)
+    return false;
+
   while (offset < size) {
     int regno;
     int insn_len;
diff --git a/lldb/test/API/functionalities/asan/Makefile b/lldb/test/API/functionalities/asan/Makefile
index 4913a18d8cc6f9..d66696fed7078f 100644
--- a/lldb/test/API/functionalities/asan/Makefile
+++ b/lldb/test/API/functionalities/asan/Makefile
@@ -1,4 +1,8 @@
 C_SOURCES := main.c
-CFLAGS_EXTRAS := -fsanitize=address -g -gcolumn-info
+asan: CFLAGS_EXTRAS := -fsanitize=address -g -gcolumn-info
+asan: all
+
+libsanitizers: CFLAGS_EXTRAS := -fsanitize=address -fsanitize-stable-abi -g -gcolumn-info
+libsanitizers: all
 
 include Makefile.rules
diff --git a/lldb/test/API/functionalities/asan/TestMemoryHistory.py b/lldb/test/API/functionalities/asan/TestMemoryHistory.py
index 00162ae8822c74..ee7939203ead18 100644
--- a/lldb/test/API/functionalities/asan/TestMemoryHistory.py
+++ b/lldb/test/API/functionalities/asan/TestMemoryHistory.py
@@ -9,15 +9,21 @@
 from lldbsuite.test import lldbplatform
 from lldbsuite.test import lldbutil
 
+from functionalities.libsanitizers.util import no_libsanitizers
 
 class AsanTestCase(TestBase):
     @skipIfFreeBSD  # llvm.org/pr21136 runtimes not yet available by
default @expectedFailureNetBSD @skipUnlessAddressSanitizer def test(self): - self.build() + self.build(make_targets=["asan"]) self.asan_tests() + @skipIf(oslist=no_match(["macosx"])) + def test_libsanitizers_asan(self): + self.build(make_targets=["libsanitizers"]) + self.libsanitizer_tests() + def setUp(self): # Call super's setUp(). TestBase.setUp(self) @@ -26,6 +32,71 @@ def setUp(self): self.line_free = line_number("main.c", "// free line") self.line_breakpoint = line_number("main.c", "// break line") + # Test line numbers: rdar://126237493 + def libsanitizer_tests(self): + target = self.createTestTarget() + + if no_libsanitizers(self): + self.skipTest("libsanitizers not found") + + self.runCmd( + "env SanitizersAddress=1 MallocSanitizerZone=1 MallocSecureAllocator=0" + ) + + self.runCmd("run") + + # In libsanitizers, memory history is not supported until a report has been generated + self.expect( + "thread list", + "Process should be stopped due to ASan report", + substrs=["stopped", "stop reason = Use of deallocated memory"], + ) + + # test the 'memory history' command + self.expect( + "memory history 'pointer'", + substrs=[ + "Memory deallocated by Thread", + "a.out`f2", + "main.c", + "Memory allocated by Thread", + "a.out`f1", + "main.c", + ], + ) + + # do the same using SB API + process = self.dbg.GetSelectedTarget().process + val = ( + process.GetSelectedThread().GetSelectedFrame().EvaluateExpression("pointer") + ) + addr = val.GetValueAsUnsigned() + threads = process.GetHistoryThreads(addr) + self.assertEqual(threads.GetSize(), 2) + + history_thread = threads.GetThreadAtIndex(0) + self.assertTrue(history_thread.num_frames >= 2) + self.assertEqual( + history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), + "main.c", + ) + + history_thread = threads.GetThreadAtIndex(1) + self.assertTrue(history_thread.num_frames >= 2) + self.assertEqual( + history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), + "main.c", + ) + + # let's free the container (SBThreadCollection) and see if the + # SBThreads still live + threads = None + self.assertTrue(history_thread.num_frames >= 2) + self.assertEqual( + history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), + "main.c", + ) + def asan_tests(self): target = self.createTestTarget() diff --git a/lldb/test/API/functionalities/asan/TestReportData.py b/lldb/test/API/functionalities/asan/TestReportData.py index 543c5fe66a208d..de0c1206a57ad6 100644 --- a/lldb/test/API/functionalities/asan/TestReportData.py +++ b/lldb/test/API/functionalities/asan/TestReportData.py @@ -9,6 +9,7 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil +from functionalities.libsanitizers.util import no_libsanitizers class AsanTestReportDataCase(TestBase): @skipIfFreeBSD # llvm.org/pr21136 runtimes not yet available by default @@ -16,9 +17,14 @@ class AsanTestReportDataCase(TestBase): @skipUnlessAddressSanitizer @skipIf(archs=["i386"], bugnumber="llvm.org/PR36710") def test(self): - self.build() + self.build(make_targets=["asan"]) self.asan_tests() + @skipIf(oslist=no_match(["macosx"])) + def test_libsanitizers_asan(self): + self.build(make_targets=["libsanitizers"]) + self.asan_tests(libsanitizers=True) + def setUp(self): # Call super's setUp(). 
TestBase.setUp(self) @@ -29,10 +35,18 @@ def setUp(self): self.line_crash = line_number("main.c", "// BOOM line") self.col_crash = 16 - def asan_tests(self): + def asan_tests(self, libsanitizers=False): target = self.createTestTarget() - self.registerSanitizerLibrariesWithTarget(target) + if libsanitizers and no_libsanitizers(self): + self.skipTest("libsanitizers not found") + + if libsanitizers: + self.runCmd( + "env SanitizersAddress=1 MallocSanitizerZone=1 MallocSecureAllocator=0" + ) + else: + self.registerSanitizerLibrariesWithTarget(target) self.runCmd("run") diff --git a/lldb/test/API/functionalities/libsanitizers/util.py b/lldb/test/API/functionalities/libsanitizers/util.py new file mode 100644 index 00000000000000..ad68541aba8d05 --- /dev/null +++ b/lldb/test/API/functionalities/libsanitizers/util.py @@ -0,0 +1,3 @@ +def no_libsanitizers(testbase): + testbase.runCmd("image list libsystem_sanitizers.dylib", check=False) + return not "libsystem_sanitizers.dylib" in testbase.res.GetOutput() diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py index 290569576ac80d..e24f3fbb4d9318 100644 --- a/lldb/test/Shell/lit.cfg.py +++ b/lldb/test/Shell/lit.cfg.py @@ -50,10 +50,14 @@ ) # Enable sanitizer runtime flags. -config.environment["ASAN_OPTIONS"] = "detect_stack_use_after_return=1" -config.environment["TSAN_OPTIONS"] = "halt_on_error=1" -if platform.system() == "Darwin": - config.environment["MallocNanoZone"] = "0" +if "Address" in config.llvm_use_sanitizer: + config.environment["ASAN_OPTIONS"] = "detect_stack_use_after_return=1" + if platform.system() == "Darwin": + config.environment["MallocNanoZone"] = "0" + +if "Thread" in config.llvm_use_sanitizer: + config.environment["TSAN_OPTIONS"] = "halt_on_error=1" + # Support running the test suite under the lldb-repro wrapper. This makes it # possible to capture a test suite run and then rerun all the test from the diff --git a/lldb/test/Shell/lit.site.cfg.py.in b/lldb/test/Shell/lit.site.cfg.py.in index 736dfc335732b5..b69e7bce1bc0be 100644 --- a/lldb/test/Shell/lit.site.cfg.py.in +++ b/lldb/test/Shell/lit.site.cfg.py.in @@ -26,6 +26,7 @@ config.lldb_enable_lua = @LLDB_ENABLE_LUA@ config.lldb_build_directory = "@LLDB_TEST_BUILD_DIRECTORY@" config.have_lldb_server = @LLDB_TOOL_LLDB_SERVER_BUILD@ config.lldb_system_debugserver = @LLDB_USE_SYSTEM_DEBUGSERVER@ +config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" # The shell tests use their own module caches. 
config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-shell") config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-shell") diff --git a/lldb/unittests/UnwindAssembly/CMakeLists.txt b/lldb/unittests/UnwindAssembly/CMakeLists.txt index 136fcd9ae97981..d6e4471af4ecb3 100644 --- a/lldb/unittests/UnwindAssembly/CMakeLists.txt +++ b/lldb/unittests/UnwindAssembly/CMakeLists.txt @@ -9,3 +9,7 @@ endif() if ("X86" IN_LIST LLVM_TARGETS_TO_BUILD) add_subdirectory(x86) endif() + +if (NOT "X86" IN_LIST LLVM_TARGETS_TO_BUILD) + add_subdirectory(x86-but-no-x86-target) +endif() diff --git a/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/CMakeLists.txt b/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/CMakeLists.txt new file mode 100644 index 00000000000000..d28e9629a64cfc --- /dev/null +++ b/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/CMakeLists.txt @@ -0,0 +1,10 @@ +add_lldb_unittest(UnwindAssemblyX86ButNoX86TargetTests + Testx86AssemblyInspectionEngine.cpp + LINK_LIBS + lldbCore + lldbSymbol + lldbPluginUnwindAssemblyX86 + LINK_COMPONENTS + Support + ${LLVM_TARGETS_TO_BUILD} + ) diff --git a/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/Testx86AssemblyInspectionEngine.cpp b/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/Testx86AssemblyInspectionEngine.cpp new file mode 100644 index 00000000000000..ed093d146440e3 --- /dev/null +++ b/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/Testx86AssemblyInspectionEngine.cpp @@ -0,0 +1,103 @@ +//===-- Testx86AssemblyInspectionEngine.cpp -------------------------------===// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "gtest/gtest.h" + +#include "Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.h" +#include "lldb/Core/AddressRange.h" +#include "lldb/Symbol/UnwindPlan.h" +#include "lldb/Utility/ArchSpec.h" + +#include "llvm/Support/TargetSelect.h" + +#include <memory> +#include <vector> + +using namespace lldb; +using namespace lldb_private; + +class Testx86AssemblyInspectionEngine : public testing::Test { +public: + static void SetUpTestCase(); +}; + +void Testx86AssemblyInspectionEngine::SetUpTestCase() { + llvm::InitializeAllTargets(); + llvm::InitializeAllAsmPrinters(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllDisassemblers(); +} + +// only defining the register names / numbers that the unwinder is actually +// using today + +// names should match the constants below. These will be the eRegisterKindLLDB +// register numbers.
+ +const char *x86_64_reg_names[] = {"rax", "rbx", "rcx", "rdx", "rsp", "rbp", + "rsi", "rdi", "r8", "r9", "r10", "r11", + "r12", "r13", "r14", "r15", "rip"}; + +enum x86_64_regs { + k_rax = 0, + k_rbx = 1, + k_rcx = 2, + k_rdx = 3, + k_rsp = 4, + k_rbp = 5, + k_rsi = 6, + k_rdi = 7, + k_r8 = 8, + k_r9 = 9, + k_r10 = 10, + k_r11 = 11, + k_r12 = 12, + k_r13 = 13, + k_r14 = 14, + k_r15 = 15, + k_rip = 16 +}; + +std::unique_ptr<x86AssemblyInspectionEngine> Getx86_64Inspector() { + + ArchSpec arch("x86_64-apple-macosx"); + std::unique_ptr<x86AssemblyInspectionEngine> engine( + new x86AssemblyInspectionEngine(arch)); + + std::vector<x86AssemblyInspectionEngine::lldb_reg_info> lldb_regnums; + int i = 0; + for (const auto &name : x86_64_reg_names) { + x86AssemblyInspectionEngine::lldb_reg_info ri; + ri.name = name; + ri.lldb_regnum = i++; + lldb_regnums.push_back(ri); + } + + engine->Initialize(lldb_regnums); + return engine; +} + +TEST_F(Testx86AssemblyInspectionEngine, TestSimple64bitFrameFunction) { + std::unique_ptr<x86AssemblyInspectionEngine> engine = Getx86_64Inspector(); + + // 'int main() { }' compiled for x86_64-apple-macosx with clang + uint8_t data[] = { + 0x55, // offset 0 -- pushq %rbp + 0x48, 0x89, 0xe5, // offset 1 -- movq %rsp, %rbp + 0x31, 0xc0, // offset 4 -- xorl %eax, %eax + 0x5d, // offset 6 -- popq %rbp + 0xc3 // offset 7 -- retq + }; + + AddressRange sample_range(0x1000, sizeof(data)); + + UnwindPlan unwind_plan(eRegisterKindLLDB); + EXPECT_FALSE(engine->GetNonCallSiteUnwindPlanFromAssembly( + data, sizeof(data), sample_range, unwind_plan)); +} diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 778ff7e437eb50..8568a7ae90e56c 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -443,14 +443,20 @@ def select_constant_cmp: GICombineRule< // TODO: handle compares (currently not marked as isCommutable) def commute_int_constant_to_rhs : GICombineRule< (defs root:$root), - (match (wip_match_opcode G_ADD, G_MUL, G_AND, G_OR, G_XOR):$root, + (match (wip_match_opcode G_ADD, G_MUL, G_AND, G_OR, G_XOR, + G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_UADDO, G_SADDO, + G_UMULO, G_SMULO, G_UMULH, G_SMULH, + G_UADDSAT, G_SADDSAT, G_SMULFIX, G_UMULFIX, + G_SMULFIXSAT, G_UMULFIXSAT):$root, [{ return Helper.matchCommuteConstantToRHS(*${root}); }]), (apply [{ Helper.applyCommuteBinOpOperands(*${root}); }]) >; def commute_fp_constant_to_rhs : GICombineRule< (defs root:$root), - (match (wip_match_opcode G_FADD, G_FMUL):$root, + (match (wip_match_opcode G_FADD, G_FMUL, G_FMINNUM, G_FMAXNUM, + G_FMINNUM_IEEE, G_FMAXNUM_IEEE, + G_FMINIMUM, G_FMAXIMUM):$root, [{ return Helper.matchCommuteFPConstantToRHS(*${root}); }]), (apply [{ Helper.applyCommuteBinOpOperands(*${root}); }]) >; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 40c5119ee7fb3b..3829c33369b275 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6273,8 +6273,21 @@ bool CombinerHelper::matchShiftsTooBig(MachineInstr &MI) { } bool CombinerHelper::matchCommuteConstantToRHS(MachineInstr &MI) { - Register LHS = MI.getOperand(1).getReg(); - Register RHS = MI.getOperand(2).getReg(); + unsigned LHSOpndIdx = 1; + unsigned RHSOpndIdx = 2; + switch (MI.getOpcode()) { + case TargetOpcode::G_UADDO: + case TargetOpcode::G_SADDO: + case TargetOpcode::G_UMULO: + case TargetOpcode::G_SMULO: + LHSOpndIdx = 2; + RHSOpndIdx = 3; + break; + default: + break; + } + Register LHS = MI.getOperand(LHSOpndIdx).getReg(); + Register RHS =
MI.getOperand(RHSOpndIdx).getReg(); if (!getIConstantVRegVal(LHS, MRI)) { // Skip commuting if LHS is not a constant. But, LHS may be a // G_CONSTANT_FOLD_BARRIER. If so we commute as long as we don't already @@ -6300,10 +6313,23 @@ bool CombinerHelper::matchCommuteFPConstantToRHS(MachineInstr &MI) { void CombinerHelper::applyCommuteBinOpOperands(MachineInstr &MI) { Observer.changingInstr(MI); - Register LHSReg = MI.getOperand(1).getReg(); - Register RHSReg = MI.getOperand(2).getReg(); - MI.getOperand(1).setReg(RHSReg); - MI.getOperand(2).setReg(LHSReg); + unsigned LHSOpndIdx = 1; + unsigned RHSOpndIdx = 2; + switch (MI.getOpcode()) { + case TargetOpcode::G_UADDO: + case TargetOpcode::G_SADDO: + case TargetOpcode::G_UMULO: + case TargetOpcode::G_SMULO: + LHSOpndIdx = 2; + RHSOpndIdx = 3; + break; + default: + break; + } + Register LHSReg = MI.getOperand(LHSOpndIdx).getReg(); + Register RHSReg = MI.getOperand(RHSOpndIdx).getReg(); + MI.getOperand(LHSOpndIdx).setReg(RHSReg); + MI.getOperand(RHSOpndIdx).setReg(LHSReg); Observer.changedInstr(MI); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 57a3f6a65e002c..7a9cfdf5c3fda9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1159,8 +1159,14 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { } SDValue Unrolled = DAG.UnrollVectorOp(Node); - for (unsigned I = 0, E = Unrolled->getNumValues(); I != E; ++I) - Results.push_back(Unrolled.getValue(I)); + if (Node->getNumValues() == 1) { + Results.push_back(Unrolled); + } else { + assert(Node->getNumValues() == Unrolled->getNumValues() && + "VectorLegalizer Expand returned wrong number of results!"); + for (unsigned I = 0, E = Unrolled->getNumValues(); I != E; ++I) + Results.push_back(Unrolled.getValue(I)); + } } SDValue VectorLegalizer::ExpandSELECT(SDNode *Node) { diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index c742680bcfa76f..73697e85b50b43 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -5343,10 +5343,11 @@ MDNode *llvm::upgradeInstructionLoopAttachment(MDNode &N) { std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) { Triple T(TT); - // The only data layout upgrades needed for pre-GCN are setting the address - // space of globals to 1. - if (T.isAMDGPU() && !T.isAMDGCN() && !DL.contains("-G") && - !DL.starts_with("G")) { + // The only data layout upgrades needed for pre-GCN, SPIR or SPIRV are setting + // the address space of globals to 1. This does not apply to SPIRV Logical. + if (((T.isAMDGPU() && !T.isAMDGCN()) || + (T.isSPIR() || (T.isSPIRV() && !T.isSPIRVLogical()))) && + !DL.contains("-G") && !DL.starts_with("G")) { return DL.empty() ? 
std::string("G1") : (DL + "-G1").str(); } diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp index 05e10a95b157c9..1dda1b89b2d36c 100644 --- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp +++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -101,6 +101,7 @@ class GCNCreateVOPD : public MachineFunctionPass { } } + SII->fixImplicitOperands(*VOPDInst); for (auto CompIdx : VOPD::COMPONENTS) VOPDInst.copyImplicitOps(*MI[CompIdx]); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 2e143c7556e866..847bf5f8dc8f9a 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -621,13 +621,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Reserve null register - it shall never be allocated reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64); - // Disallow vcc_hi allocation in wave32. It may be allocated but most likely - // will result in bugs. - if (isWave32) { - Reserved.set(AMDGPU::VCC); - Reserved.set(AMDGPU::VCC_HI); - } - // Reserve SGPRs. // unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index de492f2b1f0a4f..98f5014a34b1de 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -226,11 +226,8 @@ bool LoongArchAsmBackend::shouldInsertFixupForCodeAlign( MCFixup::create(0, Dummy, MCFixupKind(LoongArch::fixup_loongarch_align)); const MCSymbolRefExpr *MCSym = getSecToAlignSym()[Sec]; if (MCSym == nullptr) { - // Create a symbol and make the value of symbol is zero. - MCSymbol *Sym = Ctx.createNamedTempSymbol("la-relax-align"); - Sym->setFragment(&*Sec->getBeginSymbol()->getFragment()); - Asm.registerSymbol(*Sym); - MCSym = MCSymbolRefExpr::create(Sym, Ctx); + // Use section symbol directly. + MCSym = MCSymbolRefExpr::create(Sec->getBeginSymbol(), Ctx); getSecToAlignSym()[Sec] = MCSym; } diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 794455aa730400..59962216e0c041 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1226,9 +1226,9 @@ def TuneNoSinkSplatOperands "false", "Disable sink splat operands to enable .vx, .vf," ".wx, and .wf instructions">; -def TuneNoStripWSuffix - : SubtargetFeature<"no-strip-w-suffix", "EnableStripWSuffix", "false", - "Disable strip W suffix">; +def TunePreferWInst + : SubtargetFeature<"prefer-w-inst", "PreferWInst", "true", + "Prefer instructions with W suffix">; def TuneConditionalCompressedMoveFusion : SubtargetFeature<"conditional-cmv-fusion", "HasConditionalCompressedMoveFusion", diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f6ed6420c9e1fa..1d1ea6bae6c105 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -10433,14 +10433,10 @@ RISCVTargetLowering::lowerFixedLengthVectorLoadToRVV(SDValue Op, if (MinVLMAX == MaxVLMAX && MinVLMAX == VT.getVectorNumElements() && getLMUL1VT(ContainerVT).bitsLE(ContainerVT)) { MachineMemOperand *MMO = Load->getMemOperand(); - MachineFunction &MF = DAG.getMachineFunction(); - MMO = MF.getMachineMemOperand( - MMO, MMO->getPointerInfo(), - MMO->getMemoryType().isValid() - ? 
LLT::scalable_vector(1, MMO->getMemoryType().getSizeInBits()) - : MMO->getMemoryType()); SDValue NewLoad = - DAG.getLoad(ContainerVT, DL, Load->getChain(), Load->getBasePtr(), MMO); + DAG.getLoad(ContainerVT, DL, Load->getChain(), Load->getBasePtr(), + MMO->getPointerInfo(), MMO->getBaseAlign(), MMO->getFlags(), + MMO->getAAInfo(), MMO->getRanges()); SDValue Result = convertFromScalableVector(VT, NewLoad, DAG, Subtarget); return DAG.getMergeValues({Result, NewLoad.getValue(1)}, DL); } @@ -10500,14 +10496,9 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op, if (MinVLMAX == MaxVLMAX && MinVLMAX == VT.getVectorNumElements() && getLMUL1VT(ContainerVT).bitsLE(ContainerVT)) { MachineMemOperand *MMO = Store->getMemOperand(); - MachineFunction &MF = DAG.getMachineFunction(); - MMO = MF.getMachineMemOperand( - MMO, MMO->getPointerInfo(), - MMO->getMemoryType().isValid() - ? LLT::scalable_vector(1, MMO->getMemoryType().getSizeInBits()) - : MMO->getMemoryType()); return DAG.getStore(Store->getChain(), DL, NewValue, Store->getBasePtr(), - MMO); + MMO->getPointerInfo(), MMO->getBaseAlign(), + MMO->getFlags(), MMO->getAAInfo()); } SDValue VL = getVLOp(VT.getVectorNumElements(), ContainerVT, DL, DAG, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index b0fda040519a57..668062c8d33f6f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2718,6 +2718,50 @@ std::string RISCVInstrInfo::createMIROperandComment( return Comment; } +// clang-format off +#define CASE_RVV_OPCODE_UNMASK_LMUL(OP, LMUL) \ + RISCV::Pseudo##OP##_##LMUL + +#define CASE_RVV_OPCODE_MASK_LMUL(OP, LMUL) \ + RISCV::Pseudo##OP##_##LMUL##_MASK + +#define CASE_RVV_OPCODE_LMUL(OP, LMUL) \ + CASE_RVV_OPCODE_UNMASK_LMUL(OP, LMUL): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, LMUL) + +#define CASE_RVV_OPCODE_UNMASK_WIDEN(OP) \ + CASE_RVV_OPCODE_UNMASK_LMUL(OP, MF8): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, MF4): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, MF2): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, M1): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, M2): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, M4) + +#define CASE_RVV_OPCODE_UNMASK(OP) \ + CASE_RVV_OPCODE_UNMASK_WIDEN(OP): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, M8) + +#define CASE_RVV_OPCODE_MASK_WIDEN(OP) \ + CASE_RVV_OPCODE_MASK_LMUL(OP, MF8): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, MF4): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, MF2): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, M1): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, M2): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, M4) + +#define CASE_RVV_OPCODE_MASK(OP) \ + CASE_RVV_OPCODE_MASK_WIDEN(OP): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, M8) + +#define CASE_RVV_OPCODE_WIDEN(OP) \ + CASE_RVV_OPCODE_UNMASK_WIDEN(OP): \ + case CASE_RVV_OPCODE_MASK_WIDEN(OP) + +#define CASE_RVV_OPCODE(OP) \ + CASE_RVV_OPCODE_UNMASK(OP): \ + case CASE_RVV_OPCODE_MASK(OP) +// clang-format on + // clang-format off #define CASE_VMA_OPCODE_COMMON(OP, TYPE, LMUL) \ RISCV::PseudoV##OP##_##TYPE##_##LMUL @@ -2798,6 +2842,28 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, case RISCV::PseudoCCMOVGPR: // Operands 4 and 5 are commutable. 
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 4, 5); + case CASE_RVV_OPCODE(VADD_VV): + case CASE_RVV_OPCODE(VAND_VV): + case CASE_RVV_OPCODE(VOR_VV): + case CASE_RVV_OPCODE(VXOR_VV): + case CASE_RVV_OPCODE_MASK(VMSEQ_VV): + case CASE_RVV_OPCODE_MASK(VMSNE_VV): + case CASE_RVV_OPCODE(VMIN_VV): + case CASE_RVV_OPCODE(VMINU_VV): + case CASE_RVV_OPCODE(VMAX_VV): + case CASE_RVV_OPCODE(VMAXU_VV): + case CASE_RVV_OPCODE(VMUL_VV): + case CASE_RVV_OPCODE(VMULH_VV): + case CASE_RVV_OPCODE(VMULHU_VV): + case CASE_RVV_OPCODE_WIDEN(VWADD_VV): + case CASE_RVV_OPCODE_WIDEN(VWADDU_VV): + case CASE_RVV_OPCODE_WIDEN(VWMUL_VV): + case CASE_RVV_OPCODE_WIDEN(VWMULU_VV): + case CASE_RVV_OPCODE_WIDEN(VWMACC_VV): + case CASE_RVV_OPCODE_WIDEN(VWMACCU_VV): + case CASE_RVV_OPCODE_UNMASK(VADC_VVM): + // Operands 2 and 3 are commutable. + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3); case CASE_VFMA_SPLATS(FMADD): case CASE_VFMA_SPLATS(FMSUB): case CASE_VFMA_SPLATS(FMACC): diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index ad1821d57256bc..435cd7f84c6122 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -2127,8 +2127,9 @@ multiclass VPseudoBinary { - let VLMul = MInfo.value, SEW=sew in { + int TargetConstraintType = 1, + bit Commutable = 0> { + let VLMul = MInfo.value, SEW=sew, isCommutable = Commutable in { defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX); def suffix : VPseudoBinaryNoMaskTU; @@ -2167,8 +2168,9 @@ multiclass VPseudoBinaryM { - let VLMul = MInfo.value in { + int TargetConstraintType = 1, + bit Commutable = 0> { + let VLMul = MInfo.value, isCommutable = Commutable in { def "_" # MInfo.MX : VPseudoBinaryMOutNoMask; let ForceTailAgnostic = true in @@ -2226,8 +2228,8 @@ multiclass VPseudoTiedBinaryRoundingMode { - defm _VV : VPseudoBinary; +multiclass VPseudoBinaryV_VV { + defm _VV : VPseudoBinary; } multiclass VPseudoBinaryV_VV_RM { @@ -2331,9 +2333,10 @@ multiclass VPseudoVALU_MM { // * The destination EEW is greater than the source EEW, the source EMUL is // at least 1, and the overlap is in the highest-numbered part of the // destination register group is legal. Otherwise, it is illegal. -multiclass VPseudoBinaryW_VV { +multiclass VPseudoBinaryW_VV { defm _VV : VPseudoBinary; + "@earlyclobber $rd", TargetConstraintType=3, + Commutable=Commutable>; } multiclass VPseudoBinaryW_VV_RM { @@ -2453,7 +2456,9 @@ multiclass VPseudoBinaryV_VM; } -multiclass VPseudoTiedBinaryV_VM { +multiclass VPseudoTiedBinaryV_VM { + let isCommutable = Commutable in def "_VVM" # "_" # m.MX: VPseudoTiedBinaryCarryIn.R, m.vrclass, m.vrclass, m, 1, "", @@ -2667,9 +2672,11 @@ multiclass PseudoVEXT_VF8 { // lowest-numbered part of the source register group". // With LMUL<=1 the source and dest occupy a single register so any overlap // is in the lowest-numbered part. 
-multiclass VPseudoBinaryM_VV { +multiclass VPseudoBinaryM_VV { defm _VV : VPseudoBinaryM; + !if(!ge(m.octuple, 16), "@earlyclobber $rd", ""), + TargetConstraintType, Commutable=Commutable>; } multiclass VPseudoBinaryM_VX { @@ -2751,10 +2758,11 @@ multiclass VPseudoVSSHT_VV_VX_VI_RM { +multiclass VPseudoVALU_VV_VX_VI { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoBinaryV_VV, + defm "" : VPseudoBinaryV_VV, SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", mx, forceMergeOpRead=true>; defm "" : VPseudoBinaryV_VX, @@ -2804,17 +2812,17 @@ multiclass VPseudoVAALU_VV_VX_RM { multiclass VPseudoVMINMAX_VV_VX { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoBinaryV_VV, + defm "" : VPseudoBinaryV_VV, SchedBinary<"WriteVIMinMaxV", "ReadVIMinMaxV", "ReadVIMinMaxV", mx>; defm "" : VPseudoBinaryV_VX, SchedBinary<"WriteVIMinMaxX", "ReadVIMinMaxV", "ReadVIMinMaxX", mx>; } } -multiclass VPseudoVMUL_VV_VX { +multiclass VPseudoVMUL_VV_VX { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoBinaryV_VV, + defm "" : VPseudoBinaryV_VV, SchedBinary<"WriteVIMulV", "ReadVIMulV", "ReadVIMulV", mx>; defm "" : VPseudoBinaryV_VX, SchedBinary<"WriteVIMulX", "ReadVIMulV", "ReadVIMulX", mx>; @@ -2964,10 +2972,10 @@ multiclass VPseudoVALU_VX_VI { } } -multiclass VPseudoVWALU_VV_VX { +multiclass VPseudoVWALU_VV_VX { foreach m = MxListW in { defvar mx = m.MX; - defm "" : VPseudoBinaryW_VV, + defm "" : VPseudoBinaryW_VV, SchedBinary<"WriteVIWALUV", "ReadVIWALUV", "ReadVIWALUV", mx, forceMergeOpRead=true>; defm "" : VPseudoBinaryW_VX, @@ -2976,10 +2984,10 @@ multiclass VPseudoVWALU_VV_VX { } } -multiclass VPseudoVWMUL_VV_VX { +multiclass VPseudoVWMUL_VV_VX { foreach m = MxListW in { defvar mx = m.MX; - defm "" : VPseudoBinaryW_VV, + defm "" : VPseudoBinaryW_VV, SchedBinary<"WriteVIWMulV", "ReadVIWMulV", "ReadVIWMulV", mx, forceMergeOpRead=true>; defm "" : VPseudoBinaryW_VX, @@ -3074,7 +3082,7 @@ multiclass VPseudoVMRG_VM_XM_IM { multiclass VPseudoVCALU_VM_XM_IM { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoTiedBinaryV_VM, + defm "" : VPseudoTiedBinaryV_VM, SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx, forceMergeOpRead=true>; defm "" : VPseudoTiedBinaryV_XM, @@ -3287,10 +3295,10 @@ multiclass VPseudoTernaryV_VF_AAXA_RM; } -multiclass VPseudoTernaryW_VV { +multiclass VPseudoTernaryW_VV { defvar constraint = "@earlyclobber $rd"; defm _VV : VPseudoTernaryWithPolicy; + constraint, Commutable=Commutable, TargetConstraintType=3>; } multiclass VPseudoTernaryW_VV_RM { @@ -3380,10 +3388,10 @@ multiclass VPseudoVSLD_VX_VI { } } -multiclass VPseudoVWMAC_VV_VX { +multiclass VPseudoVWMAC_VV_VX { foreach m = MxListW in { defvar mx = m.MX; - defm "" : VPseudoTernaryW_VV, + defm "" : VPseudoTernaryW_VV, SchedTernary<"WriteVIWMulAddV", "ReadVIWMulAddV", "ReadVIWMulAddV", "ReadVIWMulAddV", mx>; defm "" : VPseudoTernaryW_VX, @@ -3436,10 +3444,10 @@ multiclass VPseudoVWMAC_VV_VF_BF_RM { } } -multiclass VPseudoVCMPM_VV_VX_VI { +multiclass VPseudoVCMPM_VV_VX_VI { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoBinaryM_VV, + defm "" : VPseudoBinaryM_VV, SchedBinary<"WriteVICmpV", "ReadVICmpV", "ReadVICmpV", mx>; defm "" : VPseudoBinaryM_VX, SchedBinary<"WriteVICmpX", "ReadVICmpV", "ReadVICmpX", mx>; @@ -6248,7 +6256,7 @@ defm PseudoVLSEG : VPseudoUSSegLoadFF; //===----------------------------------------------------------------------===// // 11.1. 
Vector Single-Width Integer Add and Subtract //===----------------------------------------------------------------------===// -defm PseudoVADD : VPseudoVALU_VV_VX_VI; +defm PseudoVADD : VPseudoVALU_VV_VX_VI; defm PseudoVSUB : VPseudoVALU_VV_VX; defm PseudoVRSUB : VPseudoVALU_VX_VI; @@ -6313,9 +6321,9 @@ foreach vti = AllIntegerVectors in { //===----------------------------------------------------------------------===// // 11.2. Vector Widening Integer Add/Subtract //===----------------------------------------------------------------------===// -defm PseudoVWADDU : VPseudoVWALU_VV_VX; +defm PseudoVWADDU : VPseudoVWALU_VV_VX; defm PseudoVWSUBU : VPseudoVWALU_VV_VX; -defm PseudoVWADD : VPseudoVWALU_VV_VX; +defm PseudoVWADD : VPseudoVWALU_VV_VX; defm PseudoVWSUB : VPseudoVWALU_VV_VX; defm PseudoVWADDU : VPseudoVWALU_WV_WX; defm PseudoVWSUBU : VPseudoVWALU_WV_WX; @@ -6346,9 +6354,9 @@ defm PseudoVMSBC : VPseudoVCALUM_V_X<"@earlyclobber $rd">; //===----------------------------------------------------------------------===// // 11.5. Vector Bitwise Logical Instructions //===----------------------------------------------------------------------===// -defm PseudoVAND : VPseudoVALU_VV_VX_VI; -defm PseudoVOR : VPseudoVALU_VV_VX_VI; -defm PseudoVXOR : VPseudoVALU_VV_VX_VI; +defm PseudoVAND : VPseudoVALU_VV_VX_VI; +defm PseudoVOR : VPseudoVALU_VV_VX_VI; +defm PseudoVXOR : VPseudoVALU_VV_VX_VI; //===----------------------------------------------------------------------===// // 11.6. Vector Single-Width Bit Shift Instructions @@ -6366,8 +6374,8 @@ defm PseudoVNSRA : VPseudoVNSHT_WV_WX_WI; //===----------------------------------------------------------------------===// // 11.8. Vector Integer Comparison Instructions //===----------------------------------------------------------------------===// -defm PseudoVMSEQ : VPseudoVCMPM_VV_VX_VI; -defm PseudoVMSNE : VPseudoVCMPM_VV_VX_VI; +defm PseudoVMSEQ : VPseudoVCMPM_VV_VX_VI; +defm PseudoVMSNE : VPseudoVCMPM_VV_VX_VI; defm PseudoVMSLTU : VPseudoVCMPM_VV_VX; defm PseudoVMSLT : VPseudoVCMPM_VV_VX; defm PseudoVMSLEU : VPseudoVCMPM_VV_VX_VI; @@ -6386,9 +6394,9 @@ defm PseudoVMAX : VPseudoVMINMAX_VV_VX; //===----------------------------------------------------------------------===// // 11.10. Vector Single-Width Integer Multiply Instructions //===----------------------------------------------------------------------===// -defm PseudoVMUL : VPseudoVMUL_VV_VX; -defm PseudoVMULH : VPseudoVMUL_VV_VX; -defm PseudoVMULHU : VPseudoVMUL_VV_VX; +defm PseudoVMUL : VPseudoVMUL_VV_VX; +defm PseudoVMULH : VPseudoVMUL_VV_VX; +defm PseudoVMULHU : VPseudoVMUL_VV_VX; defm PseudoVMULHSU : VPseudoVMUL_VV_VX; //===----------------------------------------------------------------------===// @@ -6402,8 +6410,8 @@ defm PseudoVREM : VPseudoVDIV_VV_VX; //===----------------------------------------------------------------------===// // 11.12. Vector Widening Integer Multiply Instructions //===----------------------------------------------------------------------===// -defm PseudoVWMUL : VPseudoVWMUL_VV_VX; -defm PseudoVWMULU : VPseudoVWMUL_VV_VX; +defm PseudoVWMUL : VPseudoVWMUL_VV_VX; +defm PseudoVWMULU : VPseudoVWMUL_VV_VX; defm PseudoVWMULSU : VPseudoVWMUL_VV_VX; //===----------------------------------------------------------------------===// @@ -6417,8 +6425,8 @@ defm PseudoVNMSUB : VPseudoVMAC_VV_VX_AAXA; //===----------------------------------------------------------------------===// // 11.14. 
Vector Widening Integer Multiply-Add Instructions //===----------------------------------------------------------------------===// -defm PseudoVWMACCU : VPseudoVWMAC_VV_VX; -defm PseudoVWMACC : VPseudoVWMAC_VV_VX; +defm PseudoVWMACCU : VPseudoVWMAC_VV_VX; +defm PseudoVWMACC : VPseudoVWMAC_VV_VX; defm PseudoVWMACCSU : VPseudoVWMAC_VV_VX; defm PseudoVWMACCUS : VPseudoVWMAC_VX; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 9a6818c99af206..71aa1f19e089a9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -307,10 +307,16 @@ multiclass VPseudoVC_X { let VLMul = m.value in { let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in { - def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_X; - def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_X; + def "PseudoVC_" # NAME # "_SE_" # m.MX + : VPseudoVC_X, + Sched<[!cast("WriteVC_" # NAME # "_" # m.MX)]>; + def "PseudoVC_V_" # NAME # "_SE_" # m.MX + : VPseudoVC_V_X, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } - def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_X; + def "PseudoVC_V_" # NAME # "_" # m.MX + : VPseudoVC_V_X, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } } @@ -318,10 +324,16 @@ multiclass VPseudoVC_XV { let VLMul = m.value in { let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in { - def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XV; - def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XV; + def "PseudoVC_" # NAME # "_SE_" # m.MX + : VPseudoVC_XV, + Sched<[!cast("WriteVC_" # NAME # "_" # m.MX)]>; + def "PseudoVC_V_" # NAME # "_SE_" # m.MX + : VPseudoVC_V_XV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } - def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XV; + def "PseudoVC_V_" # NAME # "_" # m.MX + : VPseudoVC_V_XV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } } @@ -329,10 +341,16 @@ multiclass VPseudoVC_XVV { let VLMul = m.value in { let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in { - def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XVV; - def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XVV; + def "PseudoVC_" # NAME # "_SE_" # m.MX + : VPseudoVC_XVV, + Sched<[!cast("WriteVC_" # NAME # "_" # m.MX)]>; + def "PseudoVC_V_" # NAME # "_SE_" # m.MX + : VPseudoVC_V_XVV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } - def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XVV; + def "PseudoVC_V_" # NAME # "_" # m.MX + : VPseudoVC_V_XVV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } } @@ -340,11 +358,17 @@ multiclass VPseudoVC_XVW { let VLMul = m.value in { let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in - def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XVV; + def "PseudoVC_" # NAME # "_SE_" # m.MX + : VPseudoVC_XVV, + Sched<[!cast("WriteVC_" # NAME # "_" # m.MX)]>; let Constraints = "@earlyclobber $rd, $rd = $rs3" in { let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in - def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XVV; - def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XVV; + def "PseudoVC_V_" # NAME # "_SE_" # m.MX + : VPseudoVC_V_XVV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; + def "PseudoVC_V_" # NAME # "_" # m.MX + : VPseudoVC_V_XVV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td index dd13a07d606d04..32e7f962aa2ab0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td @@ -20,13 
+20,7 @@ class CMOPInst<bits<3> imm3, string opcodestr> let Inst{12-11} = 0; } -// CMOP1, CMOP5 is used by Zicfiss. -let Predicates = [HasStdExtZcmop, NoHasStdExtZicfiss] in { - def CMOP1 : CMOPInst<0, "cmop.1">, Sched<[]>; - def CMOP5 : CMOPInst<2, "cmop.5">, Sched<[]>; -} - -foreach n = [3, 7, 9, 11, 13, 15] in { +foreach n = [1, 3, 5, 7, 9, 11, 13, 15] in { let Predicates = [HasStdExtZcmop] in - def CMOP # n : CMOPInst<!srl(n, 1), "cmop." # n>, Sched<[]>; + def C_MOP # n : CMOPInst<!srl(n, 1), "c.mop." # n>, Sched<[]>; } diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp index 39d420c2fbf080..ead91c5656be8b 100644 --- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp +++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp @@ -12,15 +12,24 @@ // extended bits aren't consumed or because the input was already sign extended // by an earlier instruction. // -// Then it removes the -w suffix from opw instructions whenever all users are -// dependent only on the lower word of the result of the instruction. -// The cases handled are: -// * addw because c.add has a larger register encoding than c.addw. -// * addiw because it helps reduce test differences between RV32 and RV64 -// w/o being a pessimization. -// * mulw because c.mulw doesn't exist but c.mul does (w/ zcb) -// * slliw because c.slliw doesn't exist and c.slli does +// Then: +// 1. Unless explicitly disabled or the target prefers instructions with W suffix, +// it removes the -w suffix from opw instructions whenever all users are +// dependent only on the lower word of the result of the instruction. +// The cases handled are: +// * addw because c.add has a larger register encoding than c.addw. +// * addiw because it helps reduce test differences between RV32 and RV64 +// w/o being a pessimization. +// * mulw because c.mulw doesn't exist but c.mul does (w/ zcb) +// * slliw because c.slliw doesn't exist and c.slli does // +// 2. Or if explicitly enabled or the target prefers instructions with W suffix, +// it adds the W suffix to the instruction whenever all users are dependent +// only on the lower word of the result of the instruction. +// The cases handled are: +// * add/addi/sub/mul. +// * slli with imm < 32. +// * ld/lwu. //===---------------------------------------------------------------------===// #include "RISCV.h" @@ -60,6 +69,8 @@ class RISCVOptWInstrs : public MachineFunctionPass { const RISCVSubtarget &ST, MachineRegisterInfo &MRI); bool stripWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII, const RISCVSubtarget &ST, MachineRegisterInfo &MRI); + bool appendWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII, + const RISCVSubtarget &ST, MachineRegisterInfo &MRI); void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -672,9 +683,6 @@ bool RISCVOptWInstrs::stripWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII, const RISCVSubtarget &ST, MachineRegisterInfo &MRI) { - if (DisableStripWSuffix || !ST.enableStripWSuffix()) - return false; - bool MadeChange = false; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { @@ -698,6 +706,58 @@ bool RISCVOptWInstrs::stripWSuffixes(MachineFunction &MF, return MadeChange; } +bool RISCVOptWInstrs::appendWSuffixes(MachineFunction &MF, + const RISCVInstrInfo &TII, + const RISCVSubtarget &ST, + MachineRegisterInfo &MRI) { + bool MadeChange = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + unsigned WOpc; + // TODO: Add more?
+ switch (MI.getOpcode()) { + default: + continue; + case RISCV::ADD: + WOpc = RISCV::ADDW; + break; + case RISCV::ADDI: + WOpc = RISCV::ADDIW; + break; + case RISCV::SUB: + WOpc = RISCV::SUBW; + break; + case RISCV::MUL: + WOpc = RISCV::MULW; + break; + case RISCV::SLLI: + // SLLIW reads the lowest 5 bits, while SLLI reads lowest 6 bits + if (MI.getOperand(2).getImm() >= 32) + continue; + WOpc = RISCV::SLLIW; + break; + case RISCV::LD: + case RISCV::LWU: + WOpc = RISCV::LW; + break; + } + + if (hasAllWUsers(MI, ST, MRI)) { + LLVM_DEBUG(dbgs() << "Replacing " << MI); + MI.setDesc(TII.get(WOpc)); + MI.clearFlag(MachineInstr::MIFlag::NoSWrap); + MI.clearFlag(MachineInstr::MIFlag::NoUWrap); + MI.clearFlag(MachineInstr::MIFlag::IsExact); + LLVM_DEBUG(dbgs() << " with " << MI); + ++NumTransformedToWInstrs; + MadeChange = true; + } + } + } + + return MadeChange; +} + bool RISCVOptWInstrs::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -711,7 +771,12 @@ bool RISCVOptWInstrs::runOnMachineFunction(MachineFunction &MF) { bool MadeChange = false; MadeChange |= removeSExtWInstrs(MF, TII, ST, MRI); - MadeChange |= stripWSuffixes(MF, TII, ST, MRI); + + if (!(DisableStripWSuffix || ST.preferWInst())) + MadeChange |= stripWSuffixes(MF, TII, ST, MRI); + + if (ST.preferWInst()) + MadeChange |= appendWSuffixes(MF, TII, ST, MRI); return MadeChange; } diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index fd6d6078ec238b..739b50749e1323 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -56,11 +56,13 @@ class RISCVTuneProcessorModel, + [Feature32Bit, + FeatureStdExtI]>, GenericTuneInfo; def GENERIC_RV64 : RISCVProcessorModel<"generic-rv64", NoSchedModel, - [Feature64Bit]>, + [Feature64Bit, + FeatureStdExtI]>, GenericTuneInfo; // Support generic for compatibility with other targets. The triple will be used // to change to the appropriate rv32/rv64 version. 
@@ -69,11 +71,13 @@ def : ProcessorModel<"generic", NoSchedModel, []>, GenericTuneInfo; def ROCKET_RV32 : RISCVProcessorModel<"rocket-rv32", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtZicsr]>; def ROCKET_RV64 : RISCVProcessorModel<"rocket-rv64", RocketModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtZicsr]>; def ROCKET : RISCVTuneProcessorModel<"rocket", @@ -86,6 +90,7 @@ def SIFIVE_7 : RISCVTuneProcessorModel<"sifive-7-series", def SIFIVE_E20 : RISCVProcessorModel<"sifive-e20", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, @@ -94,6 +99,7 @@ def SIFIVE_E20 : RISCVProcessorModel<"sifive-e20", def SIFIVE_E21 : RISCVProcessorModel<"sifive-e21", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, @@ -103,6 +109,7 @@ def SIFIVE_E21 : RISCVProcessorModel<"sifive-e21", def SIFIVE_E24 : RISCVProcessorModel<"sifive-e24", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -112,6 +119,7 @@ def SIFIVE_E24 : RISCVProcessorModel<"sifive-e24", def SIFIVE_E31 : RISCVProcessorModel<"sifive-e31", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtZicsr, FeatureStdExtM, @@ -121,6 +129,7 @@ def SIFIVE_E31 : RISCVProcessorModel<"sifive-e31", def SIFIVE_E34 : RISCVProcessorModel<"sifive-e34", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -130,6 +139,7 @@ def SIFIVE_E34 : RISCVProcessorModel<"sifive-e34", def SIFIVE_E76 : RISCVProcessorModel<"sifive-e76", SiFive7Model, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -140,6 +150,7 @@ def SIFIVE_E76 : RISCVProcessorModel<"sifive-e76", def SIFIVE_S21 : RISCVProcessorModel<"sifive-s21", RocketModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, @@ -149,6 +160,7 @@ def SIFIVE_S21 : RISCVProcessorModel<"sifive-s21", def SIFIVE_S51 : RISCVProcessorModel<"sifive-s51", RocketModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, @@ -158,6 +170,7 @@ def SIFIVE_S51 : RISCVProcessorModel<"sifive-s51", def SIFIVE_S54 : RISCVProcessorModel<"sifive-s54", RocketModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -168,6 +181,7 @@ def SIFIVE_S54 : RISCVProcessorModel<"sifive-s54", def SIFIVE_S76 : RISCVProcessorModel<"sifive-s76", SiFive7Model, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -180,6 +194,7 @@ def SIFIVE_S76 : RISCVProcessorModel<"sifive-s76", def SIFIVE_U54 : RISCVProcessorModel<"sifive-u54", RocketModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -190,6 +205,7 @@ def SIFIVE_U54 : RISCVProcessorModel<"sifive-u54", def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74", SiFive7Model, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -200,6 +216,7 @@ def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74", def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -217,6 +234,7 @@ def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model, def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, [Feature64Bit, + FeatureStdExtI, 
FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -247,6 +265,7 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -286,6 +305,7 @@ def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model, def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base", SyntacoreSCR1Model, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtC], @@ -294,6 +314,7 @@ def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base", def SYNTACORE_SCR1_MAX : RISCVProcessorModel<"syntacore-scr1-max", SyntacoreSCR1Model, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, @@ -303,6 +324,7 @@ def SYNTACORE_SCR1_MAX : RISCVProcessorModel<"syntacore-scr1-max", def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1", NoSchedModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtZicsr, FeatureStdExtZicntr, @@ -332,6 +354,7 @@ def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1", def XIANGSHAN_NANHU : RISCVProcessorModel<"xiangshan-nanhu", XiangShanNanHuModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket.td b/llvm/lib/Target/RISCV/RISCVSchedRocket.td index e74c7aab7474da..65494e73758d63 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedRocket.td +++ b/llvm/lib/Target/RISCV/RISCVSchedRocket.td @@ -261,4 +261,5 @@ defm : UnsupportedSchedZbkx; defm : UnsupportedSchedZfa; defm : UnsupportedSchedZfh; defm : UnsupportedSchedSFB; +defm : UnsupportedSchedXsfvcp; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 4dcec96df9d576..a532066b3a1c83 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -962,6 +962,54 @@ let Latency = 3 in def : InstRW<[WriteIALU], (instrs COPY)>; +// VCIX +// +// In principle we don't know the latency of any VCIX instructions. But instead +// of taking the default of 1, which can lead to issues [1], we assume that they +// have a fairly high latency. 
+// +// [1] https://github.com/llvm/llvm-project/issues/83391 +foreach mx = SchedMxList in { + defvar Cycles = SiFive7GetCyclesDefault.c; + defvar IsWorstCase = SiFive7IsWorstCaseMX.c; + let Latency = !mul(Cycles, 10), + AcquireAtCycles = [0, 1], + ReleaseAtCycles = [1, !add(1, Cycles)] in { + defm "" : LMULWriteResMX<"WriteVC_V_I", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_X", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_IV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_XV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_IVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_IVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_VVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_VVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_XVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_XVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + foreach f = ["FPR16", "FPR32", "FPR64"] in { + defm "" : LMULWriteResMX<"WriteVC_V_" # f # "V", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_" # f # "VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_" # f # "VW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + } + defm "" : LMULWriteResMX<"WriteVC_I", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_X", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_IV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_XV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_IVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_IVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_VVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_VVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_XVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_XVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + foreach f = ["FPR16", "FPR32", "FPR64"] in { + defm "" : LMULWriteResMX<"WriteVC_" # f # "V", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_" # f # "VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_" # f # "VW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + } + } +} + //===----------------------------------------------------------------------===// // Bypass and advance diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td index 8ec2e4ff885ebb..fccdd7e4f3ec2e 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td @@ -366,4 +366,5 @@ defm : UnsupportedSchedZbkx; defm : UnsupportedSchedSFB; defm : UnsupportedSchedZfa; defm : UnsupportedSchedV; +defm : UnsupportedSchedXsfvcp; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td index 0ced2efa3f7abf..6e4fb19361f553 100644 --- 
a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td @@ -1040,4 +1040,5 @@ defm : UnsupportedSchedZbkb; defm : UnsupportedSchedZbkx; defm : UnsupportedSchedSFB; defm : UnsupportedSchedZfa; +defm : UnsupportedSchedXsfvcp; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td index 9625d17e0b2600..0885e325f24e68 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td @@ -212,4 +212,5 @@ defm : UnsupportedSchedZbkb; defm : UnsupportedSchedZbkx; defm : UnsupportedSchedZfa; defm : UnsupportedSchedZfh; +defm : UnsupportedSchedXsfvcp; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td index 4fc7b0335af538..e0f1fab1d6b409 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td +++ b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td @@ -311,4 +311,5 @@ defm : UnsupportedSchedZfa; defm : UnsupportedSchedZfh; defm : UnsupportedSchedSFB; defm : UnsupportedSchedZabha; +defm : UnsupportedSchedXsfvcp; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td index 1d19624342d2bb..0086557a41fe7c 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedule.td +++ b/llvm/lib/Target/RISCV/RISCVSchedule.td @@ -296,3 +296,4 @@ def : ReadAdvance; // Include the scheduler resources for other instruction extensions. include "RISCVScheduleZb.td" include "RISCVScheduleV.td" +include "RISCVScheduleXSf.td" diff --git a/llvm/lib/Target/RISCV/RISCVScheduleXSf.td b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td new file mode 100644 index 00000000000000..58d508460f0190 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td @@ -0,0 +1,59 @@ +//===-- RISCVScheduleXSf.td - Scheduling Definitions XSf ---*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the scheduling information for SiFive extensions. 
+// +//===----------------------------------------------------------------------===// + +multiclass LMULSchedWritesVCIX<string id> { +defm "" : LMULSchedWrites<"WriteVC_" # id>; +defm "" : LMULSchedWrites<"WriteVC_V_" # id>; +} + +defm "" : LMULSchedWritesVCIX<"I">; +defm "" : LMULSchedWritesVCIX<"X">; +defm "" : LMULSchedWritesVCIX<"IV">; +defm "" : LMULSchedWritesVCIX<"VV">; +defm "" : LMULSchedWritesVCIX<"XV">; +defm "" : LMULSchedWritesVCIX<"IVV">; +defm "" : LMULSchedWritesVCIX<"IVW">; +defm "" : LMULSchedWritesVCIX<"VVV">; +defm "" : LMULSchedWritesVCIX<"VVW">; +defm "" : LMULSchedWritesVCIX<"XVV">; +defm "" : LMULSchedWritesVCIX<"XVW">; +foreach f = ["FPR16", "FPR32", "FPR64"] in { + defm "" : LMULSchedWritesVCIX<f # "V">; + defm "" : LMULSchedWritesVCIX<f # "VV">; + defm "" : LMULSchedWritesVCIX<f # "VW">; +} + +multiclass LMULWriteResVCIX<string id, list<ProcResourceKind> resources> { +defm : LMULWriteRes<"WriteVC_" # id, resources>; +defm : LMULWriteRes<"WriteVC_V_" # id, resources>; +} + +multiclass UnsupportedSchedXsfvcp { +let Unsupported = true in { +defm : LMULWriteResVCIX<"I", []>; +defm : LMULWriteResVCIX<"X", []>; +defm : LMULWriteResVCIX<"IV", []>; +defm : LMULWriteResVCIX<"VV", []>; +defm : LMULWriteResVCIX<"XV", []>; +defm : LMULWriteResVCIX<"IVV", []>; +defm : LMULWriteResVCIX<"IVW", []>; +defm : LMULWriteResVCIX<"VVV", []>; +defm : LMULWriteResVCIX<"VVW", []>; +defm : LMULWriteResVCIX<"XVV", []>; +defm : LMULWriteResVCIX<"XVW", []>; +foreach f = ["FPR16", "FPR32", "FPR64"] in { + defm : LMULWriteResVCIX<f # "V", []>; + defm : LMULWriteResVCIX<f # "VV", []>; + defm : LMULWriteResVCIX<f # "VW", []>; +} +} +} diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index fbf64f2b1dfb13..ae8baa3f119132 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -55,9 +55,9 @@ static std::string computeDataLayout(const Triple &TT) { // mean anything.
if (Arch == Triple::spirv32) return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"; + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"; return "e-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"; + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"; } static Reloc::Model getEffectiveRelocModel(std::optional RM) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index bae8579fc3650b..ba5db854647a42 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1774,6 +1774,13 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (Instruction *I = moveAddAfterMinMax(II, Builder)) return I; + // minmax (X & NegPow2C, Y & NegPow2C) --> minmax(X, Y) & NegPow2C + const APInt *RHSC; + if (match(I0, m_OneUse(m_And(m_Value(X), m_NegatedPower2(RHSC)))) && + match(I1, m_OneUse(m_And(m_Value(Y), m_SpecificInt(*RHSC))))) + return BinaryOperator::CreateAnd(Builder.CreateBinaryIntrinsic(IID, X, Y), + ConstantInt::get(II->getType(), *RHSC)); + // smax(X, -X) --> abs(X) // smin(X, -X) --> -abs(X) // umax(X, -X) --> -abs(X) @@ -1815,7 +1822,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { return NewMinMax; // Try to fold minmax with constant RHS based on range information - const APInt *RHSC; if (match(I1, m_APIntAllowUndef(RHSC))) { ICmpInst::Predicate Pred = ICmpInst::getNonStrictPredicate(MinMaxIntrinsic::getPredicate(IID)); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 90550cdbdf8911..ee783eed190a7c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -8097,6 +8097,14 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) { return new FCmpInst(I.getSwappedPredicate(), X, NegC, "", &I); } + // fcmp (fadd X, 0.0), Y --> fcmp X, Y + if (match(Op0, m_FAdd(m_Value(X), m_AnyZeroFP()))) + return new FCmpInst(Pred, X, Op1, "", &I); + + // fcmp X, (fadd Y, 0.0) --> fcmp X, Y + if (match(Op1, m_FAdd(m_Value(Y), m_AnyZeroFP()))) + return new FCmpInst(Pred, Op0, Y, "", &I); + if (match(Op0, m_FPExt(m_Value(X)))) { // fcmp (fpext X), (fpext Y) -> fcmp X, Y if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType()) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 380bac9c618077..a42ef0c4e6ae9e 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3627,10 +3627,12 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C, return createIntegerExpression(C); auto *FP = dyn_cast(&C); - if (FP && (Ty.isFloatTy() || Ty.isDoubleTy())) { + if (FP && Ty.isFloatingPointTy() && Ty.getScalarSizeInBits() <= 64) { const APFloat &APF = FP->getValueAPF(); - return DIB.createConstantValueExpression( - APF.bitcastToAPInt().getZExtValue()); + APInt const &API = APF.bitcastToAPInt(); + if (auto Temp = API.getZExtValue()) + return DIB.createConstantValueExpression(static_cast(Temp)); + return DIB.createConstantValueExpression(*API.getRawData()); } if (!Ty.isPointerTy()) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir index 76d82884a7b1f1..d791660b7a5eb2 100644 --- 
a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir @@ -116,3 +116,129 @@ body: | $q0 = COPY %mul RET_ReallyLR ... +--- +name: fminnum +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fminnum + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %min:_(s32) = G_FMINNUM [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %min(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %min:_(s32) = G_FMINNUM %cst, %0 + $s0 = COPY %min + RET_ReallyLR +... +--- +name: fmaxnum +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fmaxnum + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %max:_(s32) = G_FMAXNUM [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %max(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %max:_(s32) = G_FMAXNUM %cst, %0 + $s0 = COPY %max + RET_ReallyLR +... +--- +name: fminnum_ieee +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fminnum_ieee + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %min:_(s32) = G_FMINNUM_IEEE [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %min(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %min:_(s32) = G_FMINNUM_IEEE %cst, %0 + $s0 = COPY %min + RET_ReallyLR +... +--- +name: fmaxnum_ieee +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fmaxnum_ieee + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %max:_(s32) = G_FMAXNUM_IEEE [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %max(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %max:_(s32) = G_FMAXNUM_IEEE %cst, %0 + $s0 = COPY %max + RET_ReallyLR +... +--- +name: fminimum +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fminimum + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %min:_(s32) = G_FMINIMUM [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %min(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %min:_(s32) = G_FMINIMUM %cst, %0 + $s0 = COPY %min + RET_ReallyLR +... +--- +name: fmaximum +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fmaximum + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %max:_(s32) = G_FMAXIMUM [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %max(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %max:_(s32) = G_FMAXIMUM %cst, %0 + $s0 = COPY %max + RET_ReallyLR +... 
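As a side note on the InstCombineCalls.cpp hunk above, here is a minimal hand-written LLVM IR sketch of the new minmax fold; the function name and the constant -8 are illustrative and not taken from the patch. Masking with a negated power of two only clears low bits, which is monotonic, so it commutes with min/max:

declare i32 @llvm.umin.i32(i32, i32)

define i32 @umin_of_aligned(i32 %x, i32 %y) {
  ; Both operands are aligned down to a multiple of 8 (-8 = ...11111000).
  %xa = and i32 %x, -8
  %ya = and i32 %y, -8
  ; With the new fold (and its one-use checks satisfied), instcombine is
  ; expected to rewrite this as:
  ;   %m = call i32 @llvm.umin.i32(i32 %x, i32 %y)
  ;   %r = and i32 %m, -8
  %r = call i32 @llvm.umin.i32(i32 %xa, i32 %ya)
  ret i32 %r
}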
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir new file mode 100644 index 00000000000000..16365494f5f4ec --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir @@ -0,0 +1,456 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +--- +name: add +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: add + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %add:_(s32) = G_ADD [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %add(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 1 + %add:_(s32) = G_ADD %cst, %0 + $s0 = COPY %add + RET_ReallyLR + +... +--- +name: mul +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: mul + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_MUL [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_MUL %cst, %0 + $s0 = COPY %mul + RET_ReallyLR +... +--- +name: and +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: and + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %and:_(s32) = G_AND [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %and(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 5 + %and:_(s32) = G_AND %cst, %0 + $s0 = COPY %and + RET_ReallyLR +... +--- +name: or +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: or + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %or:_(s32) = G_OR [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %or(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 5 + %or:_(s32) = G_OR %cst, %0 + $s0 = COPY %or + RET_ReallyLR +... +--- +name: xor +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: xor + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %xor:_(s32) = G_XOR [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %xor(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 5 + %xor:_(s32) = G_XOR %cst, %0 + $s0 = COPY %xor + RET_ReallyLR +... +--- +name: smin +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: smin + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: %min:_(s32) = G_SMIN [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %min(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 10 + %min:_(s32) = G_SMIN %cst, %0 + $s0 = COPY %min + RET_ReallyLR +... 
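Similarly, for the InstCombineCompares.cpp hunk above, a small hand-written example (function name illustrative). Dropping the zero-valued fadd cannot change the comparison: +0.0 and -0.0 compare equal, and a NaN input stays NaN through the add:

define i1 @fcmp_fadd_zero(float %x, float %y) {
  %a = fadd float %x, 0.000000e+00
  ; Expected to fold to: %c = fcmp ogt float %x, %y
  %c = fcmp ogt float %a, %y
  ret i1 %c
}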
+--- +name: smax +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: smax + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: %max:_(s32) = G_SMAX [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %max(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 10 + %max:_(s32) = G_SMAX %cst, %0 + $s0 = COPY %max + RET_ReallyLR +... +--- +name: umin +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umin + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: %min:_(s32) = G_UMIN [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %min(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 10 + %min:_(s32) = G_UMIN %cst, %0 + $s0 = COPY %min + RET_ReallyLR +... +--- +name: umax +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umax + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: %max:_(s32) = G_UMAX [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %max(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 10 + %max:_(s32) = G_UMAX %cst, %0 + $s0 = COPY %max + RET_ReallyLR +... +--- +name: uaddo +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: uaddo + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %add:_(s32), %overflow:_(s1) = G_UADDO [[COPY]], %cst + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $s0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 1 + %add:_(s32), %overflow:_(s1) = G_UADDO %cst, %0 + %ret:_(s32) = G_ANYEXT %overflow + $s0 = COPY %ret + RET_ReallyLR + +... +--- +name: saddo +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: saddo + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %add:_(s32), %overflow:_(s1) = G_SADDO [[COPY]], %cst + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $s0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 1 + %add:_(s32), %overflow:_(s1) = G_SADDO %cst, %0 + %ret:_(s32) = G_ANYEXT %overflow + $s0 = COPY %ret + RET_ReallyLR + +... +--- +name: umulo +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umulo + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32), %overflow:_(s1) = G_UMULO [[COPY]], %cst + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $s0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32), %overflow:_(s1) = G_UMULO %cst, %0 + %ret:_(s32) = G_ANYEXT %overflow + $s0 = COPY %ret + RET_ReallyLR +... 
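The tests in this file all exercise one combine: a G_CONSTANT appearing as the left operand of a commutative operation is moved to the right. This mirrors the long-standing IR-level canonicalization; a hand-written IR analogue (illustrative only, not part of the patch):

define i32 @const_on_lhs(i32 %x) {
  ; instcombine canonicalizes this to: %r = add i32 %x, 1
  %r = add i32 1, %x
  ret i32 %r
}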
+---
+name: smulo
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $s0
+
+    ; CHECK-LABEL: name: smulo
+    ; CHECK: liveins: $s0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: %mul:_(s32), %overflow:_(s1) = G_SMULO [[COPY]], %cst
+    ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1)
+    ; CHECK-NEXT: $s0 = COPY %ret(s32)
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(s32) = COPY $s0
+    %cst:_(s32) = G_CONSTANT i32 3
+    %mul:_(s32), %overflow:_(s1) = G_SMULO %cst, %0
+    %ret:_(s32) = G_ANYEXT %overflow
+    $s0 = COPY %ret
+    RET_ReallyLR
+...
+---
+name: umulh
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $s0
+
+    ; CHECK-LABEL: name: umulh
+    ; CHECK: liveins: $s0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: %mul:_(s32) = G_UMULH [[COPY]], %cst
+    ; CHECK-NEXT: $s0 = COPY %mul(s32)
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(s32) = COPY $s0
+    %cst:_(s32) = G_CONSTANT i32 3
+    %mul:_(s32) = G_UMULH %cst, %0
+    $s0 = COPY %mul
+    RET_ReallyLR
+...
+---
+name: smulh
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $s0
+
+    ; CHECK-LABEL: name: smulh
+    ; CHECK: liveins: $s0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: %mul:_(s32) = G_SMULH [[COPY]], %cst
+    ; CHECK-NEXT: $s0 = COPY %mul(s32)
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(s32) = COPY $s0
+    %cst:_(s32) = G_CONSTANT i32 3
+    %mul:_(s32) = G_SMULH %cst, %0
+    $s0 = COPY %mul
+    RET_ReallyLR
+...
+---
+name: uaddsat
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $s0
+
+    ; CHECK-LABEL: name: uaddsat
+    ; CHECK: liveins: $s0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: %add:_(s32) = G_UADDSAT [[COPY]], %cst
+    ; CHECK-NEXT: $s0 = COPY %add(s32)
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(s32) = COPY $s0
+    %cst:_(s32) = G_CONSTANT i32 1
+    %add:_(s32) = G_UADDSAT %cst, %0
+    $s0 = COPY %add
+    RET_ReallyLR
+
+...
+---
+name: saddsat
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $s0
+
+    ; CHECK-LABEL: name: saddsat
+    ; CHECK: liveins: $s0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: %add:_(s32) = G_SADDSAT [[COPY]], %cst
+    ; CHECK-NEXT: $s0 = COPY %add(s32)
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(s32) = COPY $s0
+    %cst:_(s32) = G_CONSTANT i32 1
+    %add:_(s32) = G_SADDSAT %cst, %0
+    $s0 = COPY %add
+    RET_ReallyLR
+
+...
+---
+name: smulfix
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $s0
+
+    ; CHECK-LABEL: name: smulfix
+    ; CHECK: liveins: $s0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: %mul:_(s32) = G_SMULFIX [[COPY]], %cst, 7
+    ; CHECK-NEXT: $s0 = COPY %mul(s32)
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(s32) = COPY $s0
+    %cst:_(s32) = G_CONSTANT i32 3
+    %mul:_(s32) = G_SMULFIX %cst, %0, 7
+    $s0 = COPY %mul
+    RET_ReallyLR
+...
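The trailing 7 on the G_SMULFIX and G_UMULFIX operations above is the fixed-point scale. At the IR level the corresponding intrinsic multiplies the operands in double width and shifts the product right by the scale; a hand-written sketch (names illustrative):

declare i32 @llvm.smul.fix.i32(i32, i32, i32 immarg)

define i32 @smulfix_by_3(i32 %x) {
  ; Treats the inputs as having 7 fractional bits: result is (%x * 3) >> 7.
  %r = call i32 @llvm.smul.fix.i32(i32 %x, i32 3, i32 7)
  ret i32 %r
}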
+--- +name: umulfix +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umulfix + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_UMULFIX [[COPY]], %cst, 7 + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_UMULFIX %cst, %0, 7 + $s0 = COPY %mul + RET_ReallyLR +... +--- +name: smulfixsat +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: smulfixsat + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_SMULFIXSAT [[COPY]], %cst, 7 + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_SMULFIXSAT %cst, %0, 7 + $s0 = COPY %mul + RET_ReallyLR +... +--- +name: umulfixsat +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umulfixsat + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_UMULFIXSAT [[COPY]], %cst, 7 + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_UMULFIXSAT %cst, %0, 7 + $s0 = COPY %mul + RET_ReallyLR +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-const-fold-barrier-rhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-const-fold-barrier-rhs.mir index 01e0dce5a661cb..c967e4f2ea5e8c 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-const-fold-barrier-rhs.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-const-fold-barrier-rhs.mir @@ -78,3 +78,163 @@ body: | RET_ReallyLR ... +--- +name: cfb_lhs_smulo +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_smulo + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %mul:_(s32), %overflow:_(s1) = G_SMULO [[COPY]], %cfb + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $w0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %mul:_(s32), %overflow:_(s1) = G_SMULO %cfb, %0 + %ret:_(s32) = G_ANYEXT %overflow + $w0 = COPY %ret + RET_ReallyLR + +... 
+--- +name: cfb_lhs_cfb_already_rhs_smulo +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_cfb_already_rhs_smulo + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %cst2:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: %cfb2:_(s32) = G_CONSTANT_FOLD_BARRIER %cst2 + ; CHECK-NEXT: %mul:_(s32), %overflow:_(s1) = G_SMULO %cfb, %cfb2 + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $w0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %cst2:_(s32) = G_CONSTANT i32 6 + %cfb2:_(s32) = G_CONSTANT_FOLD_BARRIER %cst2 + %mul:_(s32), %overflow:_(s1) = G_SMULO %cfb, %cfb2 + %ret:_(s32) = G_ANYEXT %overflow + $w0 = COPY %ret + RET_ReallyLR + +... +--- +name: cfb_lhs_cst_on_rhs_smulo +alignment: 4 +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_cst_on_rhs_smulo + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %cst2:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: %mul:_(s32), %overflow:_(s1) = G_SMULO %cfb, %cst2 + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $w0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %cst2:_(s32) = G_CONSTANT i32 6 + %mul:_(s32), %overflow:_(s1) = G_SMULO %cfb, %cst2 + %ret:_(s32) = G_ANYEXT %overflow + $w0 = COPY %ret + RET_ReallyLR + +... +--- +name: cfb_lhs_umulfixsat +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_umulfixsat + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %mul:_(s32) = G_UMULFIXSAT [[COPY]], %cfb, 7 + ; CHECK-NEXT: $w0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %mul:_(s32) = G_UMULFIXSAT %cfb, %0, 7 + $w0 = COPY %mul + RET_ReallyLR + +... +--- +name: cfb_lhs_cfb_already_rhs_umulfixsat +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_cfb_already_rhs_umulfixsat + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %cst2:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: %cfb2:_(s32) = G_CONSTANT_FOLD_BARRIER %cst2 + ; CHECK-NEXT: %add:_(s32) = G_UMULFIXSAT %cfb, %cfb2, 7 + ; CHECK-NEXT: $w0 = COPY %add(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %cst2:_(s32) = G_CONSTANT i32 2 + %cfb2:_(s32) = G_CONSTANT_FOLD_BARRIER %cst2 + %add:_(s32) = G_UMULFIXSAT %cfb, %cfb2, 7 + $w0 = COPY %add + RET_ReallyLR + +... 
+--- +name: cfb_lhs_cst_on_rhs_umulfixsat +alignment: 4 +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_cst_on_rhs_umulfixsat + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %cst2:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: %add:_(s32) = G_UMULFIXSAT %cfb, %cst2, 7 + ; CHECK-NEXT: $w0 = COPY %add(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %cst2:_(s32) = G_CONSTANT i32 2 + %add:_(s32) = G_UMULFIXSAT %cfb, %cst2, 7 + $w0 = COPY %add + RET_ReallyLR + +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-select-to-fminmax.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-select-to-fminmax.mir index 8c4300d9e7329f..03e507f5eaa7fb 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-select-to-fminmax.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-select-to-fminmax.mir @@ -11,7 +11,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s16) = G_FMAXIMUM [[C]], [[COPY]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s16) = G_FMAXIMUM [[COPY]], [[C]] ; CHECK-NEXT: $h0 = COPY [[FMAXIMUM]](s16) ; CHECK-NEXT: RET_ReallyLR implicit $h0 %0:_(s16) = COPY $h0 @@ -33,7 +33,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[C]], [[COPY]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY]], [[C]] ; CHECK-NEXT: $s0 = COPY [[FMAXIMUM]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $s0 %0:_(s32) = COPY $s0 @@ -55,7 +55,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s64) = G_FMAXIMUM [[C]], [[COPY]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s64) = G_FMAXIMUM [[COPY]], [[C]] ; CHECK-NEXT: $d0 = COPY [[FMAXIMUM]](s64) ; CHECK-NEXT: RET_ReallyLR implicit $d0 %0:_(s64) = COPY $d0 @@ -77,7 +77,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 - ; CHECK-NEXT: [[FMINIMUM:%[0-9]+]]:_(s64) = G_FMINIMUM [[C]], [[COPY]] + ; CHECK-NEXT: [[FMINIMUM:%[0-9]+]]:_(s64) = G_FMINIMUM [[COPY]], [[C]] ; CHECK-NEXT: $d0 = COPY [[FMINIMUM]](s64) ; CHECK-NEXT: RET_ReallyLR implicit $d0 %0:_(s64) = COPY $d0 @@ -100,7 +100,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16) - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<8 x s16>) = G_FMAXIMUM [[BUILD_VECTOR]], [[COPY]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<8 x s16>) = G_FMAXIMUM [[COPY]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[FMAXIMUM]](<8 x s16>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<8 x s16>) = COPY $q0 @@ -125,7 +125,7 @@ body: | ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; CHECK-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<4 x s32>) = G_FMAXIMUM [[BUILD_VECTOR]], [[BITCAST]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<4 x s32>) = G_FMAXIMUM [[BITCAST]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[FMAXIMUM]](<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %1:_(<2 x s64>) = COPY $q0 @@ -150,7 +150,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64) - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<2 x s64>) = G_FMAXIMUM [[BUILD_VECTOR]], [[COPY]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<2 x s64>) = G_FMAXIMUM [[COPY]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[FMAXIMUM]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<2 x s64>) = COPY $q0 @@ -174,7 +174,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64) - ; CHECK-NEXT: [[FMINIMUM:%[0-9]+]]:_(<2 x s64>) = G_FMINIMUM [[BUILD_VECTOR]], [[COPY]] + ; CHECK-NEXT: [[FMINIMUM:%[0-9]+]]:_(<2 x s64>) = G_FMINIMUM [[COPY]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[FMINIMUM]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<2 x s64>) = COPY $q0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll b/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll index 7badf4732fd0d4..ae0a9b1c7c4f1f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll @@ -4,7 +4,7 @@ define half @test_s16(half %a) #0 { ; CHECK-LABEL: test_s16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: fmax h0, h1, h0 +; CHECK-NEXT: fmax h0, h0, h1 ; CHECK-NEXT: ret entry: %fcmp = fcmp olt half %a, 0.0 @@ -16,7 +16,7 @@ define float @test_s32(float %a) #0 { ; CHECK-LABEL: test_s32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: fmax s0, s1, s0 +; CHECK-NEXT: fmax s0, s0, s1 ; CHECK-NEXT: ret entry: %fcmp = fcmp olt float %a, 0.0 @@ -28,7 +28,7 @@ define double @test_s64(double %a) #0 { ; CHECK-LABEL: test_s64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: fmax d0, d1, d0 +; CHECK-NEXT: fmax d0, d0, d1 ; CHECK-NEXT: ret entry: %fcmp = fcmp olt double %a, 0.0 @@ -40,7 +40,7 @@ define <4 x half> @test_v4s16(<4 x half> %a) #0 { ; CHECK-LABEL: test_v4s16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: fmax v0.4h, v1.4h, v0.4h +; CHECK-NEXT: fmax v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret entry: %fcmp = fcmp olt <4 x half> %a, zeroinitializer @@ -52,7 +52,7 @@ define <8 x half> @test_v8s16(<8 x half> %a) #0 { ; CHECK-LABEL: test_v8s16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: fmax v0.8h, v1.8h, v0.8h +; CHECK-NEXT: fmax v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret entry: %fcmp = fcmp olt <8 x half> %a, zeroinitializer @@ -64,7 +64,7 @@ define <2 x float> @test_v2s32(<2 x float> %a) #0 { ; CHECK-LABEL: test_v2s32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: fmax v0.2s, v1.2s, v0.2s +; CHECK-NEXT: fmax v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret entry: %fcmp = fcmp olt <2 x float> %a, 
zeroinitializer @@ -76,7 +76,7 @@ define <4 x float> @test_v4s32(<4 x float> %a) #0 { ; CHECK-LABEL: test_v4s32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: fmax v0.4s, v1.4s, v0.4s +; CHECK-NEXT: fmax v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret entry: %fcmp = fcmp olt <4 x float> %a, zeroinitializer @@ -88,7 +88,7 @@ define <2 x double> @test_v2s64(<2 x double> %a) #0 { ; CHECK-LABEL: test_v2s64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: fmax v0.2d, v1.2d, v0.2d +; CHECK-NEXT: fmax v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret entry: %fcmp = fcmp olt <2 x double> %a, zeroinitializer diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir index ee0e83c5e07632..020761352148f2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir @@ -254,8 +254,8 @@ body: | ; CHECK-NEXT: %one_s32:_(s32) = G_ANYEXT %one(s16) ; CHECK-NEXT: %one_undef:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %one_s32(s32), %undef(s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[COPY]], %two_splat - ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %zero_undef, [[FMUL]] - ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %one_undef, [[FMAXNUM_IEEE]] + ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[FMUL]], %zero_undef + ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[FMAXNUM_IEEE]], %one_undef ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %two:_(s16) = G_FCONSTANT half 0xH4000 @@ -306,7 +306,7 @@ body: | ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[COPY]], %two_splat ; CHECK-NEXT: %snan_undef_fcan:_(<2 x s16>) = G_FCANONICALIZE %snan_undef ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %snan_undef_fcan, [[FMUL]] - ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %qnan_undef, [[FMAXNUM_IEEE]] + ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[FMAXNUM_IEEE]], %qnan_undef ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %two:_(s16) = G_FCONSTANT half 0xH4000 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir index d6321dae3aa7e5..67e6de1ce76449 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir @@ -318,7 +318,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %val:_(s32) = COPY $vgpr4 ; CHECK-NEXT: %k255:_(s32) = G_CONSTANT i32 255 - ; CHECK-NEXT: %umin0:_(s32) = G_UMIN %k255, %val + ; CHECK-NEXT: %umin0:_(s32) = G_UMIN %val, %k255 ; CHECK-NEXT: $vgpr0 = COPY %umin0(s32) %ptr0:_(p1) = COPY $vgpr0_vgpr1 %ptr1:_(p1) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll index dc13dee4f148ac..1d94d76da148f7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll @@ -145,10 +145,10 @@ define <2 x i16> @test_max_K0min_K1Val__v2i16(<2 x i16> %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 17 ; GFX8-NEXT: v_min_i16_e32 v1, 17, v0 -; GFX8-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v2, -12 ; GFX8-NEXT: v_max_i16_e32 v1, -12, v1 -; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll index 7e38762e7b559c..a8233054db9bc6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll @@ -145,10 +145,10 @@ define <2 x i16> @test_max_K0min_K1Val__v2u16(<2 x i16> %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 17 ; GFX8-NEXT: v_min_u16_e32 v1, 17, v0 -; GFX8-NEXT: v_min_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v2, 12 ; GFX8-NEXT: v_max_u16_e32 v1, 12, v1 -; GFX8-NEXT: v_max_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 07480a0ce0c2e7..cc0f7e2ca5a54c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -983,7 +983,7 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v5 ; CHECK-NEXT: v_mul_lo_u32 v7, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v6, v3 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_mul_lo_u32 v8, v4, v7 @@ -1010,7 +1010,7 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc ; CHECK-NEXT: v_mul_lo_u32 v6, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v7, v3, v5 ; CHECK-NEXT: v_mul_lo_u32 v5, v4, v5 ; CHECK-NEXT: v_mul_lo_u32 v8, v4, v6 ; CHECK-NEXT: v_mul_hi_u32 v9, v3, v6 @@ -1058,7 +1058,7 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_lo_u32 v6, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v3, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_mul_lo_u32 v4, v4, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 @@ -1265,10 +1265,10 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; GISEL-NEXT: v_mul_lo_u32 v12, v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_mul_lo_u32 v7, v7, v4 
@@ -1339,7 +1339,7 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_mul_lo_u32 v8, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v9, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v5, v7 ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v5 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_mul_lo_u32 v10, v6, v9 @@ -1366,7 +1366,7 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc ; CGP-NEXT: v_mul_lo_u32 v8, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v5, v7 ; CGP-NEXT: v_mul_lo_u32 v7, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v10, v6, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 @@ -1433,10 +1433,10 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_mul_lo_u32 v9, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v4, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v7, v4 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_mul_lo_u32 v11, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v5, v4 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; CGP-NEXT: v_mul_lo_u32 v8, v8, v4 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index e61317734bd0d4..aecc6a80fcc181 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -38280,9 +38280,8 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s35, 1 -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: v_and_b32_e32 v29, 1, v29 ; GFX10-NEXT: v_and_b32_e32 v30, 1, v30 ; GFX10-NEXT: v_and_b32_e32 v28, 1, v28 @@ -38377,10 +38376,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13 ; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11 ; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7 -; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s35, 1, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_hi, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v9 ; GFX10-NEXT: s_waitcnt vmcnt(32) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31 ; GFX10-NEXT: s_waitcnt vmcnt(31) @@ -38460,12 +38459,12 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28 ; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s31 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, s30 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s34 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s35 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s30 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, vcc_hi +; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s31 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s34 ; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4 -; GFX10-NEXT: v_readlane_b32 s30, v40, 2 
+; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100 ; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100 ; GFX10-NEXT: v_perm_b32 v2, v2, v52, 0x5040100 @@ -38482,8 +38481,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_perm_b32 v13, v66, v13, 0x5040100 ; GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100 ; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100 -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 s35, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s34, v40, 0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 37412ac3aa5418..99755133f36d6a 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX8,GFX678,ALL ; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10,GFX1011,ALL ; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX1011,ALL +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s --check-prefixes=GFX940,ALL ; ALL-LABEL: {{^}}build_vector2: ; R600: MOV @@ -96,3 +97,99 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 store <2 x i16> %ins.1, ptr addrspace(1) %out ret void } + +; R600-LABEL: build_v2i32_from_v4i16_shuffle: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 0, @10, KC0[], KC1[] +; R600-NEXT: TEX 1 @6 +; R600-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 6: +; R600-NEXT: VTX_READ_16 T1.X, T0.X, 48, #3 +; R600-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3 +; R600-NEXT: ALU clause starting at 10: +; R600-NEXT: MOV * T0.X, 0.0, +; R600-NEXT: ALU clause starting at 11: +; R600-NEXT: LSHL * T0.Y, T1.X, literal.x, +; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; R600-NEXT: LSHL T0.X, T0.X, literal.x, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; R600-NEXT: 16(2.242078e-44), 2(2.802597e-45) +; +; GFX6-LABEL: build_v2i32_from_v4i16_shuffle: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: build_v2i32_from_v4i16_shuffle: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: s_lshl_b32 s0, s3, 16 +; GFX8-NEXT: s_lshl_b32 s1, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: s_endpgm +; +; GFX10-LABEL: build_v2i32_from_v4i16_shuffle: +; GFX10: ; %bb.0: ; %entry 
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+;
+; GFX940-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX940:       ; %bb.0: ; %entry
+; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX940-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX940-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NEXT:    s_endpgm
+define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) {
+entry:
+  %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %zextended = zext <2 x i16> %shuf to <2 x i32>
+  %shifted = shl <2 x i32> %zextended, <i32 16, i32 16>
+  store <2 x i32> %shifted, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index ec3c08ec795235..da64c379672ef7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -1259,17 +1259,17 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
 ; GFX10SELDAG-LABEL: isnan_v4f16:
 ; GFX10SELDAG:       ; %bb.0:
 ; GFX10SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10SELDAG-NEXT:    v_mov_b32_e32 v2, 3
-; GFX10SELDAG-NEXT:    v_cmp_class_f16_e64 s5, v0, 3
-; GFX10SELDAG-NEXT:    v_cmp_class_f16_sdwa s4, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
-; GFX10SELDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
-; GFX10SELDAG-NEXT:    v_cmp_class_f16_sdwa s5, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10SELDAG-NEXT:    v_cmp_class_f16_e64 s4, v0, 3
+; GFX10SELDAG-NEXT:    v_mov_b32_e32 v3, 3
+; GFX10SELDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX10SELDAG-NEXT:    v_cmp_class_f16_e64 s4, v1, 3
+; GFX10SELDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10SELDAG-NEXT:    v_cmp_class_f16_sdwa s4, v0, v3 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10SELDAG-NEXT:    v_mov_b32_e32 v0, v5
+; GFX10SELDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX10SELDAG-NEXT:    v_cmp_class_f16_sdwa s4, v1, v3 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10SELDAG-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10SELDAG-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX10SELDAG-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10SELDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s5
-; GFX10SELDAG-NEXT:    v_cmp_class_f16_e64 s5, v1, 3
-; GFX10SELDAG-NEXT:    v_mov_b32_e32 v1, v5
-; GFX10SELDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s5
 ; GFX10SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10GLISEL-LABEL: isnan_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index ab6a9dcf71acef..a87fa8bf36d9e7 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++
b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -7404,35 +7404,35 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GFX12-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v9, s31 ; GFX12-NEXT: v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s35 ; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000 -; GFX12-NEXT: s_lshr_b32 s12, s0, 16 -; GFX12-NEXT: s_mov_b32 s14, s1 -; GFX12-NEXT: s_lshr_b32 s16, s1, 16 -; GFX12-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000 ; GFX12-NEXT: s_lshr_b32 s2, s2, 16 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v5, s23 ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v13, s25 +; GFX12-NEXT: s_mov_b32 s12, s1 +; GFX12-NEXT: s_lshr_b32 s14, s1, 16 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 ; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s7 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000 +; GFX12-NEXT: s_lshr_b32 s0, s0, 16 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v17, s19 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s21 ; GFX12-NEXT: v_mov_b32_e32 v18, s20 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:80 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:64 -; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX12-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v0, s16 ; GFX12-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 -; GFX12-NEXT: v_dual_mov_b32 v9, s15 :: v_dual_mov_b32 v8, s14 -; GFX12-NEXT: v_dual_mov_b32 v11, s17 :: v_dual_mov_b32 v10, s16 +; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v8, s12 +; GFX12-NEXT: v_dual_mov_b32 v11, s15 :: v_dual_mov_b32 v10, s14 ; GFX12-NEXT: v_dual_mov_b32 v21, s11 :: v_dual_mov_b32 v20, s10 -; GFX12-NEXT: v_dual_mov_b32 v23, s13 :: v_dual_mov_b32 v22, s12 +; GFX12-NEXT: v_dual_mov_b32 v23, s1 :: v_dual_mov_b32 v22, s0 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:112 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 952827b8cd0e71..889755c23bbc72 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -8808,73 +8808,73 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v2, 8, s6 ; GFX12-NEXT: v_lshrrev_b16 v4, 8, s5 ; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2 -; GFX12-NEXT: s_lshr_b32 s24, s7, 16 +; GFX12-NEXT: s_lshr_b32 s22, s7, 16 ; GFX12-NEXT: v_bfe_i32 v31, v1, 0, 8 -; GFX12-NEXT: s_lshr_b32 s42, s2, 24 -; GFX12-NEXT: s_mov_b32 s48, s7 +; GFX12-NEXT: s_lshr_b32 s40, s2, 24 +; GFX12-NEXT: s_mov_b32 s46, s7 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4 ; GFX12-NEXT: v_lshrrev_b16 v7, 8, s1 -; GFX12-NEXT: s_lshr_b32 s26, s6, 16 -; GFX12-NEXT: s_lshr_b32 s44, s1, 16 +; GFX12-NEXT: s_lshr_b32 s24, s6, 16 
+; GFX12-NEXT: s_lshr_b32 s42, s1, 16 ; GFX12-NEXT: s_ashr_i64 s[58:59], s[6:7], 56 -; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX12-NEXT: v_lshrrev_b16 v6, 8, s3 ; GFX12-NEXT: v_lshrrev_b16 v3, 8, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s24 -; GFX12-NEXT: s_lshr_b32 s28, s6, 24 -; GFX12-NEXT: s_lshr_b32 s30, s5, 16 -; GFX12-NEXT: s_lshr_b32 s40, s2, 16 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s22 +; GFX12-NEXT: s_lshr_b32 s26, s6, 24 +; GFX12-NEXT: s_lshr_b32 s28, s5, 16 +; GFX12-NEXT: s_lshr_b32 s38, s2, 16 ; GFX12-NEXT: v_bfe_i32 v11, v8, 0, 8 ; GFX12-NEXT: v_bfe_i32 v23, v4, 0, 8 ; GFX12-NEXT: v_bfe_i32 v27, v2, 0, 8 ; GFX12-NEXT: v_ashrrev_i32_e32 v32, 31, v31 -; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v34, s25 :: v_dual_mov_b32 v35, s58 -; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s26 -; GFX12-NEXT: v_dual_mov_b32 v56, s43 :: v_dual_mov_b32 v29, s48 -; GFX12-NEXT: v_mov_b32_e32 v30, s49 -; GFX12-NEXT: s_lshr_b32 s46, s0, 24 -; GFX12-NEXT: s_mov_b32 s50, s5 -; GFX12-NEXT: s_mov_b32 s52, s3 -; GFX12-NEXT: s_lshr_b32 s34, s4, 16 -; GFX12-NEXT: s_lshr_b32 s36, s4, 24 -; GFX12-NEXT: s_ashr_i64 s[22:23], s[2:3], 56 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s58 +; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s24 +; GFX12-NEXT: v_dual_mov_b32 v56, s41 :: v_dual_mov_b32 v29, s46 +; GFX12-NEXT: v_mov_b32_e32 v30, s47 +; GFX12-NEXT: s_lshr_b32 s44, s0, 24 +; GFX12-NEXT: s_mov_b32 s48, s5 +; GFX12-NEXT: s_mov_b32 s50, s3 +; GFX12-NEXT: s_lshr_b32 s30, s4, 16 +; GFX12-NEXT: s_lshr_b32 s34, s4, 24 +; GFX12-NEXT: s_ashr_i64 s[54:55], s[2:3], 56 ; GFX12-NEXT: s_ashr_i64 s[56:57], s[4:5], 56 ; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 8 ; GFX12-NEXT: v_bfe_i32 v19, v5, 0, 8 -; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX12-NEXT: s_lshr_b32 s38, s3, 16 -; GFX12-NEXT: s_mov_b32 s54, s1 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX12-NEXT: s_lshr_b32 s36, s3, 16 +; GFX12-NEXT: s_mov_b32 s52, s1 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[52:53], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[46:47], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[50:51], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[48:49], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x80000 ; GFX12-NEXT: s_lshr_b32 s20, s0, 16 ; GFX12-NEXT: s_ashr_i64 s[18:19], s[0:1], 56 ; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX12-NEXT: v_bfe_i32 v15, v6, 0, 8 -; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v38, s27 :: v_dual_mov_b32 v39, s28 -; GFX12-NEXT: v_dual_mov_b32 v40, s29 :: v_dual_mov_b32 v41, s30 -; GFX12-NEXT: v_dual_mov_b32 v42, s31 :: 
v_dual_mov_b32 v43, s56 -; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s34 -; GFX12-NEXT: v_dual_mov_b32 v52, s23 :: v_dual_mov_b32 v53, s40 -; GFX12-NEXT: v_dual_mov_b32 v54, s41 :: v_dual_mov_b32 v55, s42 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v38, s25 :: v_dual_mov_b32 v39, s26 +; GFX12-NEXT: v_dual_mov_b32 v40, s27 :: v_dual_mov_b32 v41, s28 +; GFX12-NEXT: v_dual_mov_b32 v42, s29 :: v_dual_mov_b32 v43, s56 +; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s30 +; GFX12-NEXT: v_dual_mov_b32 v52, s55 :: v_dual_mov_b32 v53, s38 +; GFX12-NEXT: v_dual_mov_b32 v54, s39 :: v_dual_mov_b32 v55, s40 ; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[0:1], s[54:55], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[52:53], 0x80000 ; GFX12-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX12-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GFX12-NEXT: v_ashrrev_i32_e32 v28, 31, v27 ; GFX12-NEXT: global_store_b128 v0, v[33:36], s[8:9] offset:240 -; GFX12-NEXT: v_mov_b32_e32 v33, s44 +; GFX12-NEXT: v_mov_b32_e32 v33, s42 ; GFX12-NEXT: global_store_b128 v0, v[29:32], s[8:9] offset:224 ; GFX12-NEXT: v_dual_mov_b32 v25, s16 :: v_dual_mov_b32 v26, s17 ; GFX12-NEXT: v_dual_mov_b32 v32, s7 :: v_dual_mov_b32 v21, s4 @@ -8882,16 +8882,16 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v9, s12 ; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s0 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v46, s35 :: v_dual_mov_b32 v47, s36 -; GFX12-NEXT: v_dual_mov_b32 v48, s37 :: v_dual_mov_b32 v49, s38 -; GFX12-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s18 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v46, s31 :: v_dual_mov_b32 v47, s34 +; GFX12-NEXT: v_dual_mov_b32 v48, s35 :: v_dual_mov_b32 v49, s36 +; GFX12-NEXT: v_dual_mov_b32 v34, s43 :: v_dual_mov_b32 v35, s18 ; GFX12-NEXT: v_dual_mov_b32 v36, s19 :: v_dual_mov_b32 v29, s20 ; GFX12-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GFX12-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GFX12-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v13, s2 ; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v1, s10 -; GFX12-NEXT: v_dual_mov_b32 v50, s39 :: v_dual_mov_b32 v51, s22 +; GFX12-NEXT: v_dual_mov_b32 v50, s37 :: v_dual_mov_b32 v51, s54 ; GFX12-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s6 ; GFX12-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir index 383dcced47c350..36785137787466 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir @@ -28,7 +28,7 @@ body: | ; GCN-LABEL: name: test_main ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x80000000) - ; GCN-NEXT: liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, 
$sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31 + ; GCN-NEXT: liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} ; GCN-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x35, 0x24, 0x36, 0xe9, 0x02 ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30_lo16, 32, $sgpr31_lo16, 32 @@ -73,9 +73,10 @@ body: | ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61_lo16 ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62_lo16 ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63_lo16 - ; GCN-NEXT: $sgpr0 = COPY $sgpr33 + ; GCN-NEXT: $vcc_hi = frame-setup COPY $sgpr33 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION register $sgpr33_lo16, ; GCN-NEXT: $sgpr33 = frame-setup COPY $sgpr32 - ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.68, addrspace 5) ; GCN-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr3_lo16, 0 ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5) @@ -84,11 +85,9 @@ body: | ; GCN-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr5_lo16, 256 ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5) ; GCN-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr2_lo16, 384 - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.73, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5) ; GCN-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr1_lo16, 512 - ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 - ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr0, 4, undef $vgpr5 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr33_lo16, $vgpr5_lo16, 4, 32 + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; GCN-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x41, 0x94, 0x04, 0x35, 0x24, 0x36, 0xe9, 0x02 ; GCN-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc ; GCN-NEXT: renamable $vgpr2 = IMPLICIT_DEF @@ -233,18 +232,18 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: KILL implicit-def $vcc_lo, implicit-def 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: - ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 2, implicit-def $sgpr30_sgpr31 ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 3 @@ -315,17 +314,16 @@ body: | ; GCN-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1 ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0 ; GCN-NEXT: KILL killed renamable $vgpr2 - ; GCN-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 4 - ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.68, addrspace 5) ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5) ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5) ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5) - ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.73, addrspace 5) - ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; GCN-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24, implicit-def dead $scc ; GCN-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x35, 0x24, 0x36, 0xe9, 0x02 - ; GCN-NEXT: $sgpr33 = COPY $sgpr0 + ; GCN-NEXT: $sgpr33 = frame-destroy COPY $vcc_hi ; GCN-NEXT: S_ENDPGM 0 bb.0: liveins: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir index 63bef40c34742f..b8ac50c3aeb5e8 100644 --- a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir +++ b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir @@ -160,7 +160,7 @@ body: | ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $sgpr20 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr2, 
$vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx11 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX11-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx11 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX11-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX11-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo @@ -174,7 +174,7 @@ body: | ; PAIR-GFX12-NEXT: $vgpr3 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $sgpr20 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec - ; PAIR-GFX12-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx12 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX12-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx12 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX12-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX12-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo @@ -458,9 +458,9 @@ body: | ; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX11-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX11-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx11 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX11-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx11 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit 
$exec, implicit killed $vcc_lo ; PAIR-GFX11-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec @@ -476,9 +476,9 @@ body: | ; PAIR-GFX12-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX12-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX12-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX12-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX12-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx12 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX12-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx12 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX12-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo ; PAIR-GFX12-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec @@ -559,12 +559,12 @@ body: | ; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx11 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX11-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX11-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx11 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, 
implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx11 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx11 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX11-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX11-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx11 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX11-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx11 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo @@ -586,12 +586,12 @@ body: | ; PAIR-GFX12-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX12-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX12-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx12 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX12-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX12-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx12 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX12-NEXT: $vgpr10, $vgpr17 = 
V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX12-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx12 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX12-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx12 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX12-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX12-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx12 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX12-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx12 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo diff --git a/llvm/test/CodeGen/RISCV/prefer-w-inst.ll b/llvm/test/CodeGen/RISCV/prefer-w-inst.ll new file mode 100644 index 00000000000000..34ab74d78a76fb --- /dev/null +++ b/llvm/test/CodeGen/RISCV/prefer-w-inst.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=NO-PREFER-W-INST %s +; RUN: llc -mtriple=riscv64 -mattr=+m -riscv-disable-strip-w-suffix -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=NO-STRIP %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+prefer-w-inst -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=PREFER-W-INST %s + +define i32 @addiw(i32 %a) { +; NO-PREFER-W-INST-LABEL: addiw: +; NO-PREFER-W-INST: # %bb.0: +; NO-PREFER-W-INST-NEXT: lui a1, 1 +; NO-PREFER-W-INST-NEXT: addi a1, a1, -1 +; NO-PREFER-W-INST-NEXT: addw a0, a0, a1 +; NO-PREFER-W-INST-NEXT: ret +; +; NO-STRIP-LABEL: addiw: +; NO-STRIP: # %bb.0: +; NO-STRIP-NEXT: lui a1, 1 +; NO-STRIP-NEXT: addiw a1, a1, -1 +; NO-STRIP-NEXT: addw a0, a0, a1 +; NO-STRIP-NEXT: ret +; +; PREFER-W-INST-LABEL: addiw: +; PREFER-W-INST: # %bb.0: +; PREFER-W-INST-NEXT: lui a1, 1 +; PREFER-W-INST-NEXT: addiw a1, a1, -1 +; PREFER-W-INST-NEXT: addw a0, a0, a1 +; PREFER-W-INST-NEXT: ret + %ret = add i32 %a, 4095 + ret i32 %ret +} + +define i32 @addw(i32 %a, i32 %b) { +; NO-PREFER-W-INST-LABEL: addw: +; NO-PREFER-W-INST: # %bb.0: +; NO-PREFER-W-INST-NEXT: add a0, a0, a1 +; NO-PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; NO-PREFER-W-INST-NEXT: ret +; +; NO-STRIP-LABEL: addw: +; NO-STRIP: # %bb.0: +; NO-STRIP-NEXT: addw a0, a0, a1 +; NO-STRIP-NEXT: 
addiw a0, a0, 1024 +; NO-STRIP-NEXT: ret +; +; PREFER-W-INST-LABEL: addw: +; PREFER-W-INST: # %bb.0: +; PREFER-W-INST-NEXT: addw a0, a0, a1 +; PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; PREFER-W-INST-NEXT: ret + %add = add i32 %a, %b + %ret = add i32 %add, 1024 + ret i32 %ret +} + +define i32 @mulw(i32 %a, i32 %b) { +; NO-PREFER-W-INST-LABEL: mulw: +; NO-PREFER-W-INST: # %bb.0: +; NO-PREFER-W-INST-NEXT: mul a1, a0, a1 +; NO-PREFER-W-INST-NEXT: mul a0, a0, a1 +; NO-PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; NO-PREFER-W-INST-NEXT: ret +; +; NO-STRIP-LABEL: mulw: +; NO-STRIP: # %bb.0: +; NO-STRIP-NEXT: mulw a1, a0, a1 +; NO-STRIP-NEXT: mulw a0, a0, a1 +; NO-STRIP-NEXT: addiw a0, a0, 1024 +; NO-STRIP-NEXT: ret +; +; PREFER-W-INST-LABEL: mulw: +; PREFER-W-INST: # %bb.0: +; PREFER-W-INST-NEXT: mulw a1, a0, a1 +; PREFER-W-INST-NEXT: mulw a0, a0, a1 +; PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; PREFER-W-INST-NEXT: ret + %mul1 = mul i32 %a, %b + %mul = mul i32 %a, %mul1 + %ret = add i32 %mul, 1024 + ret i32 %ret +} + +define i32 @slliw(i32 %a) { +; NO-PREFER-W-INST-LABEL: slliw: +; NO-PREFER-W-INST: # %bb.0: +; NO-PREFER-W-INST-NEXT: slli a0, a0, 1 +; NO-PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; NO-PREFER-W-INST-NEXT: ret +; +; NO-STRIP-LABEL: slliw: +; NO-STRIP: # %bb.0: +; NO-STRIP-NEXT: slliw a0, a0, 1 +; NO-STRIP-NEXT: addiw a0, a0, 1024 +; NO-STRIP-NEXT: ret +; +; PREFER-W-INST-LABEL: slliw: +; PREFER-W-INST: # %bb.0: +; PREFER-W-INST-NEXT: slliw a0, a0, 1 +; PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; PREFER-W-INST-NEXT: ret + %shl = shl i32 %a, 1 + %ret = add i32 %shl, 1024 + ret i32 %ret +} diff --git a/llvm/test/CodeGen/RISCV/prefer-w-inst.mir b/llvm/test/CodeGen/RISCV/prefer-w-inst.mir new file mode 100644 index 00000000000000..e05e27af4271c1 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/prefer-w-inst.mir @@ -0,0 +1,262 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc %s -mtriple=riscv64 -run-pass=riscv-opt-w-instrs -verify-machineinstrs \ +# RUN: -mattr=+m -o - | FileCheck %s -check-prefixes=NO-PREFER-W-INST +# RUN: llc %s -mtriple=riscv64 -run-pass=riscv-opt-w-instrs -verify-machineinstrs \ +# RUN: -mattr=+m,+prefer-w-inst -o - | FileCheck %s -check-prefixes=PREFER-W-INST + +--- +name: addi +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: addi + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI [[COPY]], 1 + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[ADDI]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: addi + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[COPY]], 1 + ; PREFER-W-INST-NEXT: [[ADDIW1:%[0-9]+]]:gpr = ADDIW [[ADDIW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW1]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = ADDI %1, 1 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... 
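# Editorial note, not part of the original patch: on RV64, ADDIW computes the
# 32-bit sum and sign-extends it to 64 bits, so ADDI and ADDIW agree on the low
# 32 bits of the result. The only user here is another ADDIW, which depends
# only on the low 32 bits of its operand, so +prefer-w-inst can rewrite ADDI to
# ADDIW, as the PREFER-W-INST checks above show.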
+ +--- +name: add +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: add + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[ADD:%[0-9]+]]:gpr = ADD [[COPY]], [[COPY1]] + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[ADD]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: add + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[ADDW:%[0-9]+]]:gpr = ADDW [[COPY]], [[COPY1]] + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[ADDW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = ADD %1, %2 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... + +--- +name: sub +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: sub + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[SUB:%[0-9]+]]:gpr = SUB [[COPY]], [[COPY1]] + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SUB]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: sub + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[SUBW:%[0-9]+]]:gpr = SUBW [[COPY]], [[COPY1]] + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SUBW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = SUB %1, %2 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... + +--- +name: mul +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: mul + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[MUL:%[0-9]+]]:gpr = MUL [[COPY]], [[COPY1]] + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[MUL]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: mul + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[MULW:%[0-9]+]]:gpr = MULW [[COPY]], [[COPY1]] + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[MULW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = MUL %1, %2 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... 
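# Editorial note, not part of the original patch: the same argument covers the
# add, sub and mul cases above. ADDW, SUBW and MULW (MULW requires the M
# extension, hence -mattr=+m in the RUN lines) all produce the sign-extended
# low 32 bits of the full-width result, which is everything the consuming
# ADDIW reads, so the rewrite cannot change the value actually used.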
+ + +--- +name: slli_31 +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: slli_31 + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[SLLI:%[0-9]+]]:gpr = SLLI [[COPY]], 31 + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLI]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: slli_31 + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[SLLIW:%[0-9]+]]:gpr = SLLIW [[COPY]], 31 + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLIW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = SLLI %1, 31 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... + +--- +name: slli_32 +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: slli_32 + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[SLLI:%[0-9]+]]:gpr = SLLI [[COPY]], 32 + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLI]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: slli_32 + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[SLLI:%[0-9]+]]:gpr = SLLI [[COPY]], 32 + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLI]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = SLLI %1, 32 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... + +--- +name: ld +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: ld + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[LD:%[0-9]+]]:gpr = LD [[COPY]], 0 + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LD]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: ld + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[LW:%[0-9]+]]:gpr = LW [[COPY]], 0 + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = LD %1, 0 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... 
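# Editorial note, not part of the original patch: the shift and load cases are
# more selective. SLLIW only encodes shift amounts 0-31, so SLLI by 31 is
# rewritten while SLLI by 32 is kept as-is above. An LD whose only use is an
# ADDIW is narrowed to LW: RISC-V is little-endian, so the low 32 bits of the
# 64-bit load are exactly the 4 bytes LW returns.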
+ +--- +name: lwu +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: lwu + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[LWU:%[0-9]+]]:gpr = LWU [[COPY]], 0 + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LWU]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: lwu + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[LW:%[0-9]+]]:gpr = LW [[COPY]], 0 + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = LWU %1, 0 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... diff --git a/llvm/test/CodeGen/RISCV/rvv/pr88799.ll b/llvm/test/CodeGen/RISCV/rvv/pr88799.ll new file mode 100644 index 00000000000000..7212a789f9e7e0 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/pr88799.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=riscv64-unknown-linux-gnu -mattr=+v | FileCheck %s + +define i32 @main() vscale_range(2,2) { +; CHECK-LABEL: main: +; CHECK: # %bb.0: # %vector.body +; CHECK-NEXT: lui a0, 1040368 +; CHECK-NEXT: addiw a0, a0, -144 +; CHECK-NEXT: vl2re16.v v8, (a0) +; CHECK-NEXT: vs2r.v v8, (zero) +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: ret +vector.body: + %0 = load <16 x i16>, ptr getelementptr ([3 x [23 x [23 x i16]]], ptr null, i64 -10593, i64 1, i64 22, i64 0), align 16 + store <16 x i16> %0, ptr null, align 2 + %wide.load = load , ptr getelementptr ([3 x [23 x [23 x i16]]], ptr null, i64 -10593, i64 1, i64 22, i64 0), align 16 + store %wide.load, ptr null, align 2 + ret i32 0 +} diff --git a/llvm/test/CodeGen/RISCV/strip-w-suffix.ll b/llvm/test/CodeGen/RISCV/strip-w-suffix.ll deleted file mode 100644 index 4124b3d0d360d2..00000000000000 --- a/llvm/test/CodeGen/RISCV/strip-w-suffix.ll +++ /dev/null @@ -1,74 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ -; RUN: | FileCheck -check-prefixes=STRIP %s -; RUN: llc -mtriple=riscv64 -mattr=+m,+no-strip-w-suffix -verify-machineinstrs < %s \ -; RUN: | FileCheck -check-prefixes=NO-STRIP %s - -define i32 @addiw(i32 %a) { -; STRIP-LABEL: addiw: -; STRIP: # %bb.0: -; STRIP-NEXT: lui a1, 1 -; STRIP-NEXT: addi a1, a1, -1 -; STRIP-NEXT: addw a0, a0, a1 -; STRIP-NEXT: ret -; -; NO-STRIP-LABEL: addiw: -; NO-STRIP: # %bb.0: -; NO-STRIP-NEXT: lui a1, 1 -; NO-STRIP-NEXT: addiw a1, a1, -1 -; NO-STRIP-NEXT: addw a0, a0, a1 -; NO-STRIP-NEXT: ret - %ret = add i32 %a, 4095 - ret i32 %ret -} - -define i32 @addw(i32 %a, i32 %b) { -; STRIP-LABEL: addw: -; STRIP: # %bb.0: -; STRIP-NEXT: add a0, a0, a1 -; STRIP-NEXT: addiw a0, a0, 1024 -; STRIP-NEXT: ret -; -; NO-STRIP-LABEL: addw: -; NO-STRIP: # %bb.0: -; NO-STRIP-NEXT: addw a0, a0, a1 -; NO-STRIP-NEXT: addiw a0, a0, 1024 -; NO-STRIP-NEXT: ret - %add = add i32 %a, %b - %ret = add i32 %add, 1024 - ret i32 %ret -} - -define i32 @mulw(i32 %a, i32 %b) { -; STRIP-LABEL: mulw: -; STRIP: # %bb.0: -; STRIP-NEXT: mul a0, a0, a1 -; STRIP-NEXT: 
addiw a0, a0, 1024 -; STRIP-NEXT: ret -; -; NO-STRIP-LABEL: mulw: -; NO-STRIP: # %bb.0: -; NO-STRIP-NEXT: mulw a0, a0, a1 -; NO-STRIP-NEXT: addiw a0, a0, 1024 -; NO-STRIP-NEXT: ret - %mul = mul i32 %a, %b - %ret = add i32 %mul, 1024 - ret i32 %ret -} - -define i32 @slliw(i32 %a) { -; STRIP-LABEL: slliw: -; STRIP: # %bb.0: -; STRIP-NEXT: slli a0, a0, 1 -; STRIP-NEXT: addiw a0, a0, 1024 -; STRIP-NEXT: ret -; -; NO-STRIP-LABEL: slliw: -; NO-STRIP: # %bb.0: -; NO-STRIP-NEXT: slliw a0, a0, 1 -; NO-STRIP-NEXT: addiw a0, a0, 1024 -; NO-STRIP-NEXT: ret - %shl = shl i32 %a, 1 - %ret = add i32 %shl, 1024 - ret i32 %ret -} diff --git a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s index 18e0ede5e29375..0e27d6301bb3cd 100644 --- a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +++ b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s @@ -28,7 +28,7 @@ # RELAX: Relocations [ # RELAX-NEXT: Section ({{.*}}) .rela.text { -# RELAX-NEXT: 0x4 R_LARCH_ALIGN {{.*}} 0x4 +# RELAX-NEXT: 0x4 R_LARCH_ALIGN .text 0x4 # RELAX-NEXT: 0x10 R_LARCH_PCALA_HI20 .L1 0x0 # RELAX-NEXT: 0x10 R_LARCH_RELAX - 0x0 # RELAX-NEXT: 0x14 R_LARCH_PCALA_LO12 .L1 0x0 diff --git a/llvm/test/MC/LoongArch/Relocations/relax-align.s b/llvm/test/MC/LoongArch/Relocations/relax-align.s index 294fd9fb916c75..0246d5b46431c9 100644 --- a/llvm/test/MC/LoongArch/Relocations/relax-align.s +++ b/llvm/test/MC/LoongArch/Relocations/relax-align.s @@ -63,17 +63,19 @@ ret ## Test the symbol index is different from .text. .section .text2, "ax" .p2align 4 +.p2align 4, , 4 break 7 # RELOC: Relocations [ # RELAX-RELOC-NEXT: Section ({{.*}}) .rela.text { -# RELAX-RELOC-NEXT: 0x24 R_LARCH_ALIGN .Lla-relax-align0 0x4 -# RELAX-RELOC-NEXT: 0x34 R_LARCH_ALIGN .Lla-relax-align0 0x5 -# RELAX-RELOC-NEXT: 0x50 R_LARCH_ALIGN .Lla-relax-align0 0x4 -# RELAX-RELOC-NEXT: 0x60 R_LARCH_ALIGN .Lla-relax-align0 0xB04 -# RELAX-RELOC-NEXT: 0x70 R_LARCH_ALIGN .Lla-relax-align0 0x4 +# RELAX-RELOC-NEXT: 0x24 R_LARCH_ALIGN .text 0x4 +# RELAX-RELOC-NEXT: 0x34 R_LARCH_ALIGN .text 0x5 +# RELAX-RELOC-NEXT: 0x50 R_LARCH_ALIGN .text 0x4 +# RELAX-RELOC-NEXT: 0x60 R_LARCH_ALIGN .text 0xB04 +# RELAX-RELOC-NEXT: 0x70 R_LARCH_ALIGN .text 0x4 # RELAX-RELOC-NEXT: } # RELAX-RELOC-NEXT: Section ({{.*}}) .rela.text2 { -# RELAX-RELOC-NEXT: 0x0 R_LARCH_ALIGN .Lla-relax-align1 0x4 +# RELAX-RELOC-NEXT: 0x0 R_LARCH_ALIGN .text2 0x4 +# RELAX-RELOC-NEXT: 0xC R_LARCH_ALIGN .text2 0x404 # RELAX-RELOC-NEXT: } # RELOC-NEXT: ] diff --git a/llvm/test/MC/RISCV/rv32zcmop-invalid.s b/llvm/test/MC/RISCV/rv32zcmop-invalid.s index 71d72d59b02092..fb6252f7f0760c 100644 --- a/llvm/test/MC/RISCV/rv32zcmop-invalid.s +++ b/llvm/test/MC/RISCV/rv32zcmop-invalid.s @@ -1,7 +1,7 @@ # RUN: not llvm-mc -triple riscv32 -mattr=+zcmop < %s 2>&1 | FileCheck %s -cmop.0 # CHECK: :[[@LINE]]:1: error: unrecognized instruction mnemonic +c.mop.0 # CHECK: :[[@LINE]]:1: error: unrecognized instruction mnemonic -cmop.1 t0 # CHECK: :[[@LINE]]:8: error: invalid operand for instruction +c.mop.1 t0 # CHECK: :[[@LINE]]:9: error: invalid operand for instruction -cmop.1 0x0 # CHECK: :[[@LINE]]:8: error: invalid operand for instruction +c.mop.1 0x0 # CHECK: :[[@LINE]]:9: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rvzcmop-valid.s b/llvm/test/MC/RISCV/rvzcmop-valid.s index c6bb4a15808258..dd5d26ac5dd0cd 100644 --- a/llvm/test/MC/RISCV/rvzcmop-valid.s +++ b/llvm/test/MC/RISCV/rvzcmop-valid.s @@ -9,34 +9,34 @@ # RUN: | llvm-objdump --mattr=+zcmop -d -r - \ # RUN: | 
FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# CHECK-ASM-AND-OBJ: cmop.1 +# CHECK-ASM-AND-OBJ: c.mop.1 # CHECK-ASM: encoding: [0x81,0x60] -cmop.1 +c.mop.1 -# CHECK-ASM-AND-OBJ: cmop.3 +# CHECK-ASM-AND-OBJ: c.mop.3 # CHECK-ASM: encoding: [0x81,0x61] -cmop.3 +c.mop.3 -# CHECK-ASM-AND-OBJ: cmop.5 +# CHECK-ASM-AND-OBJ: c.mop.5 # CHECK-ASM: encoding: [0x81,0x62] -cmop.5 +c.mop.5 -# CHECK-ASM-AND-OBJ: cmop.7 +# CHECK-ASM-AND-OBJ: c.mop.7 # CHECK-ASM: encoding: [0x81,0x63] -cmop.7 +c.mop.7 -# CHECK-ASM-AND-OBJ: cmop.9 +# CHECK-ASM-AND-OBJ: c.mop.9 # CHECK-ASM: encoding: [0x81,0x64] -cmop.9 +c.mop.9 -# CHECK-ASM-AND-OBJ: cmop.11 +# CHECK-ASM-AND-OBJ: c.mop.11 # CHECK-ASM: encoding: [0x81,0x65] -cmop.11 +c.mop.11 -# CHECK-ASM-AND-OBJ: cmop.13 +# CHECK-ASM-AND-OBJ: c.mop.13 # CHECK-ASM: encoding: [0x81,0x66] -cmop.13 +c.mop.13 -# CHECK-ASM-AND-OBJ: cmop.15 +# CHECK-ASM-AND-OBJ: c.mop.15 # CHECK-ASM: encoding: [0x81,0x67] -cmop.15 +c.mop.15 diff --git a/llvm/test/Transforms/InstCombine/fcmp.ll b/llvm/test/Transforms/InstCombine/fcmp.ll index f2701d16d0f3d1..069512b0f2d8eb 100644 --- a/llvm/test/Transforms/InstCombine/fcmp.ll +++ b/llvm/test/Transforms/InstCombine/fcmp.ll @@ -1284,3 +1284,205 @@ define <1 x i1> @bitcast_1vec_eq0(i32 %x) { %cmp = fcmp oeq <1 x float> %f, zeroinitializer ret <1 x i1> %cmp } + +; Simplify fcmp (x + 0.0), y => fcmp x, y + +define i1 @fcmp_fadd_zero_ugt(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ugt( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ugt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_uge(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_uge( +; CHECK-NEXT: [[CMP:%.*]] = fcmp uge float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp uge float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ogt(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ogt( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ogt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_oge(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_oge( +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp oge float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ult(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ult( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ult float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ult float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ule(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ule( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ule float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ule float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_olt(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_olt( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp olt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ole(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ole( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ole float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = 
fcmp ole float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_oeq(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_oeq( +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp oeq float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_one(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_one( +; CHECK-NEXT: [[CMP:%.*]] = fcmp one float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp one float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ueq(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ueq( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ueq float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ueq float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_une(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_une( +; CHECK-NEXT: [[CMP:%.*]] = fcmp une float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp une float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ord(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ord( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ord float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ord float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_uno(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_uno( +; CHECK-NEXT: [[CMP:%.*]] = fcmp uno float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp uno float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_neg_zero(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_neg_zero( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, -0.000000e+00 + %cmp = fcmp ugt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_switched(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_switched( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ult float [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %y, 0.000000e+00 + %cmp = fcmp ugt float %x, %add + ret i1 %cmp +} + +define <2 x i1> @fcmp_fadd_zero_vec(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @fcmp_fadd_zero_vec( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt <2 x float> [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %add = fadd <2 x float> %x, + %cmp = fcmp ugt <2 x float> %add, %y + ret <2 x i1> %cmp +} + +define i1 @fcmp_fast_fadd_fast_zero(float %x, float %y) { +; CHECK-LABEL: @fcmp_fast_fadd_fast_zero( +; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ugt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd fast float %x, 0.000000e+00 + %cmp = fcmp fast ugt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fast_fadd_zero(float %x, float %y) { +; CHECK-LABEL: @fcmp_fast_fadd_zero( +; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ugt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp fast ugt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_fast_zero(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_fast_zero( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd fast float %x, 0.000000e+00 + %cmp = fcmp ugt float %add, %y + ret i1 %cmp +} diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll 
b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll index ae2e115b1dd9a2..bd1a47bbfcc193 100644 --- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll @@ -2581,3 +2581,92 @@ entry: %val = call i8 @llvm.umin.i8(i8 %sub, i8 3) ret i8 %val } + +define i8 @test_umax_and(i8 %x, i8 %y) { +; CHECK-LABEL: @test_umax_and( +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.umax.i8(i8 [[X1:%.*]], i8 [[Y1:%.*]]) +; CHECK-NEXT: [[RES1:%.*]] = and i8 [[RES]], -64 +; CHECK-NEXT: ret i8 [[RES1]] +; + %x1 = and i8 %x, -64 + %y1 = and i8 %y, -64 + %res = call i8 @llvm.umax.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_umin_and(i8 %x, i8 %y) { +; CHECK-LABEL: @test_umin_and( +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.umin.i8(i8 [[X1:%.*]], i8 [[Y1:%.*]]) +; CHECK-NEXT: [[RES1:%.*]] = and i8 [[RES]], -64 +; CHECK-NEXT: ret i8 [[RES1]] +; + %x1 = and i8 %x, -64 + %y1 = and i8 %y, -64 + %res = call i8 @llvm.umin.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_smax_and(i8 %x, i8 %y) { +; CHECK-LABEL: @test_smax_and( +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.smax.i8(i8 [[X1:%.*]], i8 [[Y1:%.*]]) +; CHECK-NEXT: [[RES1:%.*]] = and i8 [[RES]], -64 +; CHECK-NEXT: ret i8 [[RES1]] +; + %x1 = and i8 %x, -64 + %y1 = and i8 %y, -64 + %res = call i8 @llvm.smax.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_smin_and(i8 %x, i8 %y) { +; CHECK-LABEL: @test_smin_and( +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.smin.i8(i8 [[X1:%.*]], i8 [[Y1:%.*]]) +; CHECK-NEXT: [[RES1:%.*]] = and i8 [[RES]], -64 +; CHECK-NEXT: ret i8 [[RES1]] +; + %x1 = and i8 %x, -64 + %y1 = and i8 %y, -64 + %res = call i8 @llvm.smin.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_smin_and_mismatch(i8 %x, i8 %y) { +; CHECK-LABEL: @test_smin_and_mismatch( +; CHECK-NEXT: [[X1:%.*]] = and i8 [[X:%.*]], -64 +; CHECK-NEXT: [[Y1:%.*]] = and i8 [[Y:%.*]], -32 +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.smin.i8(i8 [[X1]], i8 [[Y1]]) +; CHECK-NEXT: ret i8 [[RES]] +; + %x1 = and i8 %x, -64 + %y1 = and i8 %y, -32 + %res = call i8 @llvm.smin.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_smin_and_non_negated_pow2(i8 %x, i8 %y) { +; CHECK-LABEL: @test_smin_and_non_negated_pow2( +; CHECK-NEXT: [[X1:%.*]] = and i8 [[X:%.*]], 31 +; CHECK-NEXT: [[Y1:%.*]] = and i8 [[Y:%.*]], 31 +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.smin.i8(i8 [[X1]], i8 [[Y1]]) +; CHECK-NEXT: ret i8 [[RES]] +; + %x1 = and i8 %x, 31 + %y1 = and i8 %y, 31 + %res = call i8 @llvm.smin.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_smin_and_multiuse(i8 %x, i8 %y) { +; CHECK-LABEL: @test_smin_and_multiuse( +; CHECK-NEXT: [[X1:%.*]] = and i8 [[X:%.*]], 31 +; CHECK-NEXT: [[Y1:%.*]] = and i8 [[Y:%.*]], 31 +; CHECK-NEXT: call void @use(i8 [[Y1]]) +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.smin.i8(i8 [[X1]], i8 [[Y1]]) +; CHECK-NEXT: ret i8 [[RES]] +; + %x1 = and i8 %x, 31 + %y1 = and i8 %y, 31 + call void @use(i8 %y1) + %res = call i8 @llvm.smin.i8(i8 %x1, i8 %y1) + ret i8 %res +} diff --git a/llvm/test/Transforms/SCCP/pr50901.ll b/llvm/test/Transforms/SCCP/pr50901.ll index 11d6bba6f6a935..d48d67532d88bd 100644 --- a/llvm/test/Transforms/SCCP/pr50901.ll +++ b/llvm/test/Transforms/SCCP/pr50901.ll @@ -52,6 +52,16 @@ ; CHECK: = !DIGlobalVariableExpression(var: ![[DBG_FLOAT_UNDEF:.+]], expr: !DIExpression()) ; CHECK-DAG: ![[DBG_FLOAT_UNDEF]] = distinct !DIGlobalVariable(name: "g_float_undef" +; CHECK: ![[G8:[0-9]+]] = !DIGlobalVariableExpression(var: ![[DBG8:[0-9]+]], expr: !DIExpression(DW_OP_constu, 22136, 
DW_OP_stack_value)) +; CHECK-DAG: ![[DBG8]] = distinct !DIGlobalVariable(name: "g_88", {{.*}} +; CHECK: ![[G9:[0-9]+]] = !DIGlobalVariableExpression(var: ![[DBG9:[0-9]+]], expr: !DIExpression(DW_OP_constu, 23726, DW_OP_stack_value)) +; CHECK-DAG: ![[DBG9]] = distinct !DIGlobalVariable(name: "g_99", {{.*}} + +; CHECK-DAG: ![[DBGA:[0-9]+]] = distinct !DIGlobalVariable(name: "g_i32_undef" +; CHECK-DAG: ![[GA:[0-9]+]] = !DIGlobalVariableExpression(var: ![[DBGA]], expr: !DIExpression()) +; CHECK-DAG: ![[DBGB:[0-9]+]] = distinct !DIGlobalVariable(name: "g_ptr_undef" +; CHECK-DAG: ![[GB:[0-9]+]] = !DIGlobalVariableExpression(var: ![[DBGB]], expr: !DIExpression()) + @g_1 = dso_local global i32 -4, align 4, !dbg !0 @g_2 = dso_local global float 0x4011C28F60000000, align 4, !dbg !8 @g_3 = dso_local global i8 97, align 1, !dbg !10 @@ -59,6 +69,8 @@ @g_5 = dso_local global i8 1, align 1, !dbg !16 @g_6 = dso_local global ptr null, align 8, !dbg !19 @g_7 = dso_local global ptr null, align 8, !dbg !23 +@g_8 = dso_local global half 0xH4321, align 4, !dbg !86 +@g_9 = dso_local global bfloat 0xR3F80, align 4, !dbg !90 @_ZL4g_11 = internal global i32 -5, align 4, !dbg !25 @_ZL4g_22 = internal global float 0x4016333340000000, align 4, !dbg !27 @_ZL4g_33 = internal global i8 98, align 1, !dbg !29 @@ -67,6 +79,10 @@ @_ZL4g_66 = internal global ptr null, align 8, !dbg !35 @_ZL4g_77 = internal global ptr inttoptr (i64 70 to ptr), align 8, !dbg !37 @g_float_undef = internal global float undef, align 4, !dbg !83 +@_ZL4g_88 = internal global half 0xH5678, align 4, !dbg !88 +@_ZL4g_99 = internal global bfloat 0xR5CAE, align 4, !dbg !92 +@g_i32_undef = internal global i32 undef, align 4, !dbg !95 +@g_ptr_undef = internal global ptr undef, align 8, !dbg !97 define dso_local void @_Z3barv() !dbg !46 { entry: @@ -88,6 +104,15 @@ entry: store ptr %6, ptr @g_7, align 8, !dbg !59 %l = load float, ptr @g_float_undef, align 8, !dbg !59 store float %l, ptr @g_2, align 8, !dbg !59 + %7 = load half, ptr @_ZL4g_88, align 4, !dbg !59 + store half %7, ptr @g_8, align 4, !dbg !59 + %8 = load bfloat, ptr @_ZL4g_99, align 4, !dbg !59 + store bfloat %8, ptr @g_9, align 4, !dbg !59 + %9 = load i32, ptr @g_i32_undef, align 4, !dbg !59 + store i32 %9, ptr @g_1, align 4, !dbg !59 + %10 = load ptr, ptr @g_ptr_undef, align 8, !dbg !59 + store ptr %10, ptr @g_6, align 8, !dbg !59 + ret void, !dbg !59 } @@ -108,7 +133,7 @@ entry: !4 = !{!5} !5 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !6, size: 64) !6 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) -!7 = !{!0, !8, !10, !13, !16, !19, !23, !25, !27, !29, !31, !33, !35, !37, !83} +!7 = !{!0, !8, !10, !13, !16, !19, !23, !25, !27, !29, !31, !33, !35, !37, !83, !86, !88, !90, !92, !95, !97} !8 = !DIGlobalVariableExpression(var: !9, expr: !DIExpression()) !9 = distinct !DIGlobalVariable(name: "g_2", scope: !2, file: !3, line: 2, type: !6, isLocal: false, isDefinition: true) !10 = !DIGlobalVariableExpression(var: !11, expr: !DIExpression()) @@ -159,3 +184,17 @@ entry: !82 = !DILocation(line: 31, column: 1, scope: !77) !83 = !DIGlobalVariableExpression(var: !84, expr: !DIExpression()) !84 = distinct !DIGlobalVariable(name: "g_float_undef", linkageName: "g_float_undef", scope: !2, file: !3, line: 15, type: !6, isLocal: true, isDefinition: true) +!85 = !DIBasicType(name: "float", size: 16, encoding: DW_ATE_float) +!86 = !DIGlobalVariableExpression(var: !87, expr: !DIExpression()) +!87 = distinct !DIGlobalVariable(name: "g_8", scope: !2, file: !3, line: 2, type: !85, isLocal: 
false, isDefinition: true) +!88 = !DIGlobalVariableExpression(var: !89, expr: !DIExpression()) +!89 = distinct !DIGlobalVariable(name: "g_88", linkageName: "_ZL4g_88", scope: !2, file: !3, line: 10, type: !85, isLocal: true, isDefinition: true) +!90 = !DIGlobalVariableExpression(var: !91, expr: !DIExpression()) +!91 = distinct !DIGlobalVariable(name: "g_9", scope: !2, file: !3, line: 2, type: !85, isLocal: false, isDefinition: true) +!92 = !DIGlobalVariableExpression(var: !93, expr: !DIExpression()) +!93 = distinct !DIGlobalVariable(name: "g_99", linkageName: "_ZL4g_99", scope: !2, file: !3, line: 10, type: !85, isLocal: true, isDefinition: true) + +!95 = !DIGlobalVariableExpression(var: !96, expr: !DIExpression()) +!96 = distinct !DIGlobalVariable(name: "g_i32_undef", linkageName: "g_i32_undef", scope: !2, file: !3, line: 9, type: !22, isLocal: true, isDefinition: true) +!97 = !DIGlobalVariableExpression(var: !98, expr: !DIExpression()) +!98 = distinct !DIGlobalVariable(name: "g_ptr_undef", linkageName: "g_ptr_undef", scope: !2, file: !3, line: 14, type: !21, isLocal: true, isDefinition: true) diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp index 0a947f6e206fef..4699fbbea5def0 100644 --- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp +++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp @@ -22,7 +22,7 @@ namespace llvm { namespace exegesis { -#if defined(__linux__) && !defined(__ANDROID__) +#if defined(__linux__) long SubprocessMemory::getCurrentTID() { // We're using the raw syscall here rather than the gettid() function provided @@ -31,6 +31,8 @@ long SubprocessMemory::getCurrentTID() { return syscall(SYS_gettid); } +#if !defined(__ANDROID__) + Error SubprocessMemory::initializeSubprocessMemory(pid_t ProcessID) { // Add the PID to the shared memory name so that if we're running multiple // processes at the same time, they won't interfere with each other. @@ -157,7 +159,8 @@ Expected SubprocessMemory::setupAuxiliaryMemoryInSubprocess( SubprocessMemory::~SubprocessMemory() {} -#endif // defined(__linux__) && !defined(__ANDROID__) +#endif // !defined(__ANDROID__) +#endif // defined(__linux__) } // namespace exegesis } // namespace llvm diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp index 4865616e3e2ba1..d7e4dba4ac1703 100644 --- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp +++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp @@ -59,6 +59,14 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) { EXPECT_EQ(UpgradeDataLayoutString("e-m:e-p:64:64-i64:64-i128:128-n64-S128", "riscv64"), "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"); + + // Check that SPIR && SPIRV targets add -G1 if it's not present. + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "spir"), "e-p:32:32-G1"); + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "spir64"), "e-p:32:32-G1"); + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "spirv32"), "e-p:32:32-G1"); + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "spirv64"), "e-p:32:32-G1"); + // but that SPIRV Logical does not. 
+ EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "spirv"), "e-p:32:32"); } TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) { @@ -100,6 +108,17 @@ TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) { "p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128-p9:192:256:256:32"); EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p7:64:64-G1", "amdgcn"), "e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128-p9:192:256:256:32"); + + // Check that SPIR & SPIRV targets don't add -G1 if there is already a -G + // flag. + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "spir"), "e-p:32:32-G2"); + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "spir64"), "e-p:32:32-G2"); + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "spirv32"), "e-p:32:32-G2"); + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "spirv64"), "e-p:32:32-G2"); + EXPECT_EQ(UpgradeDataLayoutString("G2", "spir"), "G2"); + EXPECT_EQ(UpgradeDataLayoutString("G2", "spir64"), "G2"); + EXPECT_EQ(UpgradeDataLayoutString("G2", "spirv32"), "G2"); + EXPECT_EQ(UpgradeDataLayoutString("G2", "spirv64"), "G2"); } TEST(DataLayoutUpgradeTest, EmptyDataLayout) { @@ -113,6 +132,14 @@ TEST(DataLayoutUpgradeTest, EmptyDataLayout) { EXPECT_EQ(UpgradeDataLayoutString("", "r600"), "G1"); EXPECT_EQ(UpgradeDataLayoutString("", "amdgcn"), "G1-ni:7:8:9-p7:160:256:256:32-p8:128:128-p9:192:256:256:32"); + + // Check that SPIR & SPIRV targets add G1 if it's not present. + EXPECT_EQ(UpgradeDataLayoutString("", "spir"), "G1"); + EXPECT_EQ(UpgradeDataLayoutString("", "spir64"), "G1"); + EXPECT_EQ(UpgradeDataLayoutString("", "spirv32"), "G1"); + EXPECT_EQ(UpgradeDataLayoutString("", "spirv64"), "G1"); + // but SPIRV Logical does not. + EXPECT_EQ(UpgradeDataLayoutString("", "spirv"), ""); } } // end namespace diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp index a86775a1366b05..d7d0ea2c6a6e79 100644 --- a/llvm/unittests/Transforms/Utils/LocalTest.cpp +++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp @@ -1241,6 +1241,18 @@ TEST(Local, ExpressionForConstant) { EXPECT_NE(Expr, nullptr); EXPECT_EQ(Expr->getElement(1), 13841306799765140275U); + // Half. + Type *HalfTy = Type::getHalfTy(Context); + Expr = createExpression(ConstantFP::get(HalfTy, 5.55), HalfTy); + EXPECT_NE(Expr, nullptr); + EXPECT_EQ(Expr->getElement(1), 17805U); + + // BFloat. + Type *BFloatTy = Type::getBFloatTy(Context); + Expr = createExpression(ConstantFP::get(BFloatTy, -5.55), BFloatTy); + EXPECT_NE(Expr, nullptr); + EXPECT_EQ(Expr->getElement(1), 49330U); + // Pointer. PointerType *PtrTy = PointerType::get(Context, 0); Expr = createExpression(ConstantPointerNull::get(PtrTy), PtrTy); @@ -1257,15 +1269,6 @@ TEST(Local, ExpressionForConstant) { EXPECT_NE(Expr, nullptr); EXPECT_EQ(Expr->getElement(1), 5678U); - // Others. 
- Type *HalfTy = Type::getHalfTy(Context); - Expr = createExpression(ConstantFP::get(HalfTy, 32), HalfTy); - EXPECT_EQ(Expr, nullptr); - - Type *BFloatTy = Type::getBFloatTy(Context); - Expr = createExpression(ConstantFP::get(BFloatTy, 32), BFloatTy); - EXPECT_EQ(Expr, nullptr); - Type *FP128Ty = Type::getFP128Ty(Context); Expr = createExpression(ConstantFP::get(FP128Ty, 32), FP128Ty); EXPECT_EQ(Expr, nullptr); diff --git a/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h index 8e840e744064d5..1ea73752208156 100644 --- a/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h @@ -53,6 +53,17 @@ void reorderOperandsByHoistability(RewriterBase &rewriter, AffineApplyOp op); /// maximally compose chains of AffineApplyOps. FailureOr decompose(RewriterBase &rewriter, AffineApplyOp op); +/// Reify a bound for the given variable in terms of SSA values for which +/// `stopCondition` is met. +/// +/// By default, lower/equal bounds are closed and upper bounds are open. If +/// `closedUB` is set to "true", upper bounds are also closed. +FailureOr +reifyValueBound(OpBuilder &b, Location loc, presburger::BoundType type, + const ValueBoundsConstraintSet::Variable &var, + ValueBoundsConstraintSet::StopConditionFn stopCondition, + bool closedUB = false); + /// Reify a bound for the given index-typed value in terms of SSA values for /// which `stopCondition` is met. If no stop condition is specified, reify in /// terms of the operands of the owner op. diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h index 970a52a06a11a2..bbc7e5d3e0dd70 100644 --- a/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h @@ -24,6 +24,17 @@ enum class BoundType; namespace arith { +/// Reify a bound for the given variable in terms of SSA values for which +/// `stopCondition` is met. +/// +/// By default, lower/equal bounds are closed and upper bounds are open. If +/// `closedUB` is set to "true", upper bounds are also closed. +FailureOr +reifyValueBound(OpBuilder &b, Location loc, presburger::BoundType type, + const ValueBoundsConstraintSet::Variable &var, + ValueBoundsConstraintSet::StopConditionFn stopCondition, + bool closedUB = false); + /// Reify a bound for the given index-typed value in terms of SSA values for /// which `stopCondition` is met. If no stop condition is specified, reify in /// terms of the operands of the owner op. 
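[Editorial aside, not part of the patch: a minimal sketch of how the new `Variable`-based `reifyValueBound` overload above might be called. `builder`, `loc`, `v`, and `scope` are placeholders; the elided `FailureOr` result type is assumed to be `OpFoldResult`, and `StopConditionFn` is assumed to receive the candidate value, its optional dimension, and the constraint set.]

```c++
#include "mlir/Dialect/Arith/Transforms/Transforms.h"
#include "mlir/Interfaces/ValueBoundsOpInterface.h"

using namespace mlir;

// Reify an upper bound for dimension 0 of the shaped value `v`, expressed in
// terms of values defined outside the region `scope`. With `closedUB=true`
// the reified bound is inclusive rather than the default exclusive UB.
static FailureOr<OpFoldResult> reifyDim0UpperBound(OpBuilder &builder,
                                                   Location loc, Value v,
                                                   Region *scope) {
  ValueBoundsConstraintSet::Variable var(v, /*dim=*/0);
  return arith::reifyValueBound(
      builder, loc, presburger::BoundType::UB, var,
      /*stopCondition=*/
      [&](Value value, std::optional<int64_t> dim,
          ValueBoundsConstraintSet &cstr) {
        // Stop the backward traversal once we leave `scope`; such values
        // become the operands of the reified expression.
        return value.getParentRegion() != scope;
      },
      /*closedUB=*/true);
}
```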
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h
index 304a9740d91ed3..27a766aceb3160 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h
@@ -284,11 +284,10 @@ using TaskgroupClauseOps =
     detail::Clauses;
 
 using TaskloopClauseOps =
-    detail::Clauses;
+    detail::Clauses;
 
 using TaskwaitClauseOps = detail::Clauses;
 
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 3abdbe3adfd0be..82be7ad31a158f 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -840,7 +840,8 @@ def YieldOp : OpenMP_Op<"yield",
 //===----------------------------------------------------------------------===//
 def DistributeOp : OpenMP_Op<"distribute", [AttrSizedOperandSegments,
                              DeclareOpInterfaceMethods<LoopWrapperInterface>,
-                             RecursiveMemoryEffects]> {
+                             RecursiveMemoryEffects,
+                             SingleBlockImplicitTerminator<"TerminatorOp">]> {
   let summary = "distribute construct";
   let description = [{
     The distribute construct specifies that the iterations of one or more loops
@@ -855,15 +856,28 @@ def DistributeOp : OpenMP_Op<"distribute", [AttrSizedOperandSegments,
     The distribute loop construct specifies that the iterations of the loop(s)
     will be executed in parallel by threads in the current context. These
     iterations are spread across threads that already exist in the enclosing
-    region. The lower and upper bounds specify a half-open range: the
-    range includes the lower bound but does not include the upper bound. If the
-    `inclusive` attribute is specified then the upper bound is also included.
+    region.
+
+    The body region can contain a single block which must contain a single
+    operation and a terminator. The operation must be another compatible loop
+    wrapper or an `omp.loop_nest`.
 
     The `dist_schedule_static` attribute specifies the schedule for this loop,
     determining how the loop is distributed across the parallel threads. The
     optional `schedule_chunk` associated with it further controls this
     distribution.
 
+    ```mlir
+    omp.distribute {
+      omp.loop_nest (%i1, %i2) : index = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) {
+        %a = load %arrA[%i1, %i2] : memref<?x?xf32>
+        %b = load %arrB[%i1, %i2] : memref<?x?xf32>
+        %sum = arith.addf %a, %b : f32
+        store %sum, %arrC[%i1, %i2] : memref<?x?xf32>
+        omp.yield
+      }
+    }
+    ```
     // TODO: private_var, firstprivate_var, lastprivate_var, collapse
   }];
   let arguments = (ins
@@ -1016,10 +1030,10 @@ def TaskOp : OpenMP_Op<"task", [AttrSizedOperandSegments,
 }
 
 def TaskloopOp : OpenMP_Op<"taskloop", [AttrSizedOperandSegments,
-                           AutomaticAllocationScope, RecursiveMemoryEffects,
-                           AllTypesMatch<["lowerBound", "upperBound", "step"]>,
+                           AutomaticAllocationScope,
                            DeclareOpInterfaceMethods<LoopWrapperInterface>,
-                           ReductionClauseInterface]> {
+                           RecursiveMemoryEffects, ReductionClauseInterface,
+                           SingleBlockImplicitTerminator<"TerminatorOp">]> {
   let summary = "taskloop construct";
   let description = [{
     The taskloop construct specifies that the iterations of one or more
@@ -1027,21 +1041,19 @@ def TaskloopOp : OpenMP_Op<"taskloop", [AttrSizedOperandSegments,
     iterations are distributed across tasks generated by the construct and
     scheduled to be executed.
 
-    The `lowerBound` and `upperBound` specify a half-open range: the range
-    includes the lower bound but does not include the upper bound. If the
-    `inclusive` attribute is specified then the upper bound is also included.
- The `step` specifies the loop step. - - The body region can contain any number of blocks. + The body region can contain a single block which must contain a single + operation and a terminator. The operation must be another compatible loop + wrapper or an `omp.loop_nest`. ``` - omp.taskloop - for (%i1, %i2) : index = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) { - %a = load %arrA[%i1, %i2] : memref - %b = load %arrB[%i1, %i2] : memref - %sum = arith.addf %a, %b : f32 - store %sum, %arrC[%i1, %i2] : memref - omp.terminator + omp.taskloop { + omp.loop_nest (%i1, %i2) : index = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) { + %a = load %arrA[%i1, %i2] : memref + %b = load %arrB[%i1, %i2] : memref + %sum = arith.addf %a, %b : f32 + store %sum, %arrC[%i1, %i2] : memref + omp.yield + } } ``` @@ -1118,11 +1130,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", [AttrSizedOperandSegments, created. }]; - let arguments = (ins Variadic:$lowerBound, - Variadic:$upperBound, - Variadic:$step, - UnitAttr:$inclusive, - Optional:$if_expr, + let arguments = (ins Optional:$if_expr, Optional:$final_expr, UnitAttr:$untied, UnitAttr:$mergeable, @@ -1165,8 +1173,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", [AttrSizedOperandSegments, |`grain_size` `(` $grain_size `:` type($grain_size) `)` |`num_tasks` `(` $num_tasks `:` type($num_tasks) `)` |`nogroup` $nogroup - ) `for` custom($region, $lowerBound, $upperBound, $step, - type($step), $inclusive) attr-dict + ) $region attr-dict }]; let extraClassDeclaration = [{ diff --git a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h index 1d7bc6ea961cc3..ac17ace5a976d2 100644 --- a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h +++ b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h @@ -15,6 +15,7 @@ #include "mlir/IR/Value.h" #include "mlir/Interfaces/DestinationStyleOpInterface.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/ExtensibleRTTI.h" #include @@ -111,6 +112,39 @@ class ValueBoundsConstraintSet public: static char ID; + /// A variable that can be added to the constraint set as a "column". The + /// value bounds infrastructure can compute bounds for variables and compare + /// two variables. + /// + /// Internally, a variable is represented as an affine map and operands. + class Variable { + public: + /// Construct a variable for an index-typed attribute or SSA value. + Variable(OpFoldResult ofr); + + /// Construct a variable for an index-typed SSA value. + Variable(Value indexValue); + + /// Construct a variable for a dimension of a shaped value. + Variable(Value shapedValue, int64_t dim); + + /// Construct a variable for an index-typed attribute/SSA value or for a + /// dimension of a shaped value. A non-null dimension must be provided if + /// and only if `ofr` is a shaped value. + Variable(OpFoldResult ofr, std::optional dim); + + /// Construct a variable for a map and its operands. + Variable(AffineMap map, ArrayRef mapOperands); + Variable(AffineMap map, ArrayRef mapOperands); + + MLIRContext *getContext() const { return map.getContext(); } + + private: + friend class ValueBoundsConstraintSet; + AffineMap map; + ValueDimList mapOperands; + }; + /// The stop condition when traversing the backward slice of a shaped value/ /// index-type value. The traversal continues until the stop condition /// evaluates to "true" for a value. 
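// Illustrative aside (not part of the patch): the Variable constructors above
// give all bound queries a single entry point. A sketch, assuming `v` is an
// index-typed Value, `t` a shaped Value, and `map`/`operands` a single-result
// AffineMap with its operands (the two map-based overloads are assumed to
// take ArrayRef<Variable> and ArrayRef<Value>, respectively):
//
//   using Var = ValueBoundsConstraintSet::Variable;
//   Var a(v);             // index-typed SSA value
//   Var b(t, /*dim=*/0);  // dimension 0 of a shaped value
//   Var c(map, operands); // affine combination of other values
//   FailureOr<bool> eq = ValueBoundsConstraintSet::areEqual(a, b);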
@@ -121,35 +155,31 @@ class ValueBoundsConstraintSet
   using StopConditionFn = std::function<bool(
       Value, std::optional<int64_t> /*dim*/, ValueBoundsConstraintSet &cstr)>;
 
-  /// Compute a bound for the given index-typed value or shape dimension size.
-  /// The computed bound is stored in `resultMap`. The operands of the bound are
-  /// stored in `mapOperands`. An operand is either an index-type SSA value
-  /// or a shaped value and a dimension.
+  /// Compute a bound for the given variable. The computed bound is stored in
+  /// `resultMap`. The operands of the bound are stored in `mapOperands`. An
+  /// operand is either an index-type SSA value or a shaped value and a
+  /// dimension.
   ///
-  /// `dim` must be `nullopt` if and only if `value` is index-typed. The bound
-  /// is computed in terms of values/dimensions for which `stopCondition`
-  /// evaluates to "true". To that end, the backward slice (reverse use-def
-  /// chain) of the given value is visited in a worklist-driven manner and the
-  /// constraint set is populated according to `ValueBoundsOpInterface` for each
-  /// visited value.
+  /// The bound is computed in terms of values/dimensions for which
+  /// `stopCondition` evaluates to "true". To that end, the backward slice
+  /// (reverse use-def chain) of the given value is visited in a worklist-driven
+  /// manner and the constraint set is populated according to
+  /// `ValueBoundsOpInterface` for each visited value.
   ///
   /// By default, lower/equal bounds are closed and upper bounds are open. If
   /// `closedUB` is set to "true", upper bounds are also closed.
-  static LogicalResult computeBound(AffineMap &resultMap,
-                                    ValueDimList &mapOperands,
-                                    presburger::BoundType type, Value value,
-                                    std::optional<int64_t> dim,
-                                    StopConditionFn stopCondition,
-                                    bool closedUB = false);
+  static LogicalResult
+  computeBound(AffineMap &resultMap, ValueDimList &mapOperands,
+               presburger::BoundType type, const Variable &var,
+               StopConditionFn stopCondition, bool closedUB = false);
 
   /// Compute a bound in terms of the values/dimensions in `dependencies`. The
   /// computed bound consists of only constant terms and dependent values (or
   /// dimension sizes thereof).
   static LogicalResult
   computeDependentBound(AffineMap &resultMap, ValueDimList &mapOperands,
-                        presburger::BoundType type, Value value,
-                        std::optional<int64_t> dim, ValueDimList dependencies,
-                        bool closedUB = false);
+                        presburger::BoundType type, const Variable &var,
+                        ValueDimList dependencies, bool closedUB = false);
 
   /// Compute a bound that is independent of all values in `independencies`.
   ///
@@ -161,13 +191,10 @@ class ValueBoundsConstraintSet
   /// appear in the computed bound.
   static LogicalResult
   computeIndependentBound(AffineMap &resultMap, ValueDimList &mapOperands,
-                          presburger::BoundType type, Value value,
-                          std::optional<int64_t> dim, ValueRange independencies,
-                          bool closedUB = false);
+                          presburger::BoundType type, const Variable &var,
+                          ValueRange independencies, bool closedUB = false);
 
-  /// Compute a constant bound for the given affine map, where dims and symbols
-  /// are bound to the given operands. The affine map must have exactly one
-  /// result.
+  /// Compute a constant bound for the given variable.
   ///
   /// This function traverses the backward slice of the given operands in a
   /// worklist-driven manner until `stopCondition` evaluates to "true". The
@@ -182,16 +209,9 @@ class ValueBoundsConstraintSet
   /// By default, lower/equal bounds are closed and upper bounds are open. If
   /// `closedUB` is set to "true", upper bounds are also closed.
static FailureOr - computeConstantBound(presburger::BoundType type, Value value, - std::optional dim = std::nullopt, + computeConstantBound(presburger::BoundType type, const Variable &var, StopConditionFn stopCondition = nullptr, bool closedUB = false); - static FailureOr computeConstantBound( - presburger::BoundType type, AffineMap map, ValueDimList mapOperands, - StopConditionFn stopCondition = nullptr, bool closedUB = false); - static FailureOr computeConstantBound( - presburger::BoundType type, AffineMap map, ArrayRef mapOperands, - StopConditionFn stopCondition = nullptr, bool closedUB = false); /// Compute a constant delta between the given two values. Return "failure" /// if a constant delta could not be determined. @@ -221,9 +241,8 @@ class ValueBoundsConstraintSet /// proven. This could be because the specified relation does in fact not hold /// or because there is not enough information in the constraint set. In other /// words, if we do not know for sure, this function returns "false". - bool populateAndCompare(OpFoldResult lhs, std::optional lhsDim, - ComparisonOperator cmp, OpFoldResult rhs, - std::optional rhsDim); + bool populateAndCompare(const Variable &lhs, ComparisonOperator cmp, + const Variable &rhs); /// Return "true" if "lhs cmp rhs" was proven to hold. Return "false" if the /// specified relation could not be proven. This could be because the @@ -233,24 +252,12 @@ class ValueBoundsConstraintSet /// /// This function keeps traversing the backward slice of lhs/rhs until could /// prove the relation or until it ran out of IR. - static bool compare(OpFoldResult lhs, std::optional lhsDim, - ComparisonOperator cmp, OpFoldResult rhs, - std::optional rhsDim); - static bool compare(AffineMap lhs, ValueDimList lhsOperands, - ComparisonOperator cmp, AffineMap rhs, - ValueDimList rhsOperands); - static bool compare(AffineMap lhs, ArrayRef lhsOperands, - ComparisonOperator cmp, AffineMap rhs, - ArrayRef rhsOperands); - - /// Compute whether the given values/dimensions are equal. Return "failure" if + static bool compare(const Variable &lhs, ComparisonOperator cmp, + const Variable &rhs); + + /// Compute whether the given variables are equal. Return "failure" if /// equality could not be determined. - /// - /// `dim1`/`dim2` must be `nullopt` if and only if `value1`/`value2` are - /// index-typed. - static FailureOr areEqual(OpFoldResult value1, OpFoldResult value2, - std::optional dim1 = std::nullopt, - std::optional dim2 = std::nullopt); + static FailureOr areEqual(const Variable &var1, const Variable &var2); /// Return "true" if the given slices are guaranteed to be overlapping. /// Return "false" if the given slices are guaranteed to be non-overlapping. @@ -317,9 +324,6 @@ class ValueBoundsConstraintSet /// /// This function does not analyze any IR and does not populate any additional /// constraints. - bool compareValueDims(OpFoldResult lhs, std::optional lhsDim, - ComparisonOperator cmp, OpFoldResult rhs, - std::optional rhsDim); bool comparePos(int64_t lhsPos, ComparisonOperator cmp, int64_t rhsPos); /// Given an affine map with a single result (and map operands), add a new @@ -374,6 +378,7 @@ class ValueBoundsConstraintSet /// constraint system. Return the position of the new column. Any operands /// that were not analyzed yet are put on the worklist. int64_t insert(AffineMap map, ValueDimList operands, bool isSymbol = true); + int64_t insert(const Variable &var, bool isSymbol = true); /// Project out the given column in the constraint set. 
void projectOut(int64_t pos); @@ -381,6 +386,8 @@ class ValueBoundsConstraintSet /// Project out all columns for which the condition holds. void projectOut(function_ref condition); + void projectOutAnonymous(std::optional except = std::nullopt); + /// Mapping of columns to values/shape dimensions. SmallVector> positionToValueDim; /// Reverse mapping of values/shape dimensions to columns. diff --git a/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp index e0c3abe7a0f71d..82a9fb0d490882 100644 --- a/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp @@ -120,9 +120,7 @@ mlir::affine::fullyComposeAndComputeConstantDelta(Value value1, Value value2) { mapOperands.push_back(value1); mapOperands.push_back(value2); affine::fullyComposeAffineMapAndOperands(&map, &mapOperands); - ValueDimList valueDims; - for (Value v : mapOperands) - valueDims.push_back({v, std::nullopt}); return ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::EQ, map, valueDims); + presburger::BoundType::EQ, + ValueBoundsConstraintSet::Variable(map, mapOperands)); } diff --git a/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp b/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp index 117ee8e8701ad7..1a266b72d1f8d3 100644 --- a/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp @@ -16,16 +16,15 @@ using namespace mlir; using namespace mlir::affine; -static FailureOr -reifyValueBound(OpBuilder &b, Location loc, presburger::BoundType type, - Value value, std::optional dim, - ValueBoundsConstraintSet::StopConditionFn stopCondition, - bool closedUB) { +FailureOr mlir::affine::reifyValueBound( + OpBuilder &b, Location loc, presburger::BoundType type, + const ValueBoundsConstraintSet::Variable &var, + ValueBoundsConstraintSet::StopConditionFn stopCondition, bool closedUB) { // Compute bound. AffineMap boundMap; ValueDimList mapOperands; if (failed(ValueBoundsConstraintSet::computeBound( - boundMap, mapOperands, type, value, dim, stopCondition, closedUB))) + boundMap, mapOperands, type, var, stopCondition, closedUB))) return failure(); // Reify bound. @@ -93,7 +92,7 @@ FailureOr mlir::affine::reifyShapedValueDimBound( // the owner of `value`. return v != value; }; - return reifyValueBound(b, loc, type, value, dim, + return reifyValueBound(b, loc, type, {value, dim}, stopCondition ? stopCondition : reifyToOperands, closedUB); } @@ -105,7 +104,7 @@ FailureOr mlir::affine::reifyIndexValueBound( ValueBoundsConstraintSet &cstr) { return v != value; }; - return reifyValueBound(b, loc, type, value, /*dim=*/std::nullopt, + return reifyValueBound(b, loc, type, value, stopCondition ? 
stopCondition : reifyToOperands, closedUB); } diff --git a/mlir/lib/Dialect/Arith/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/Arith/IR/ValueBoundsOpInterfaceImpl.cpp index f0d43808bc45df..7cfcc4180539c2 100644 --- a/mlir/lib/Dialect/Arith/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Arith/IR/ValueBoundsOpInterfaceImpl.cpp @@ -107,9 +107,9 @@ struct SelectOpInterface // If trueValue <= falseValue: // * result <= falseValue // * result >= trueValue - if (cstr.compare(trueValue, dim, + if (cstr.compare(/*lhs=*/{trueValue, dim}, ValueBoundsConstraintSet::ComparisonOperator::LE, - falseValue, dim)) { + /*rhs=*/{falseValue, dim})) { if (dim) { cstr.bound(value)[*dim] >= cstr.getExpr(trueValue, dim); cstr.bound(value)[*dim] <= cstr.getExpr(falseValue, dim); @@ -121,9 +121,9 @@ struct SelectOpInterface // If falseValue <= trueValue: // * result <= trueValue // * result >= falseValue - if (cstr.compare(falseValue, dim, + if (cstr.compare(/*lhs=*/{falseValue, dim}, ValueBoundsConstraintSet::ComparisonOperator::LE, - trueValue, dim)) { + /*rhs=*/{trueValue, dim})) { if (dim) { cstr.bound(value)[*dim] >= cstr.getExpr(falseValue, dim); cstr.bound(value)[*dim] <= cstr.getExpr(trueValue, dim); diff --git a/mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp b/mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp index 79fabd6ed2e99a..f87f3d6350c022 100644 --- a/mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp @@ -449,7 +449,7 @@ struct IndexCastPattern final : NarrowingPattern { return failure(); FailureOr ub = ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, in, /*dim=*/std::nullopt, + presburger::BoundType::UB, in, /*stopCondition=*/nullptr, /*closedUB=*/true); if (failed(ub)) return failure(); diff --git a/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp b/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp index fad221288f190e..5fb7953f937007 100644 --- a/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp @@ -61,16 +61,15 @@ static Value buildArithValue(OpBuilder &b, Location loc, AffineMap map, return buildExpr(map.getResult(0)); } -static FailureOr -reifyValueBound(OpBuilder &b, Location loc, presburger::BoundType type, - Value value, std::optional dim, - ValueBoundsConstraintSet::StopConditionFn stopCondition, - bool closedUB) { +FailureOr mlir::arith::reifyValueBound( + OpBuilder &b, Location loc, presburger::BoundType type, + const ValueBoundsConstraintSet::Variable &var, + ValueBoundsConstraintSet::StopConditionFn stopCondition, bool closedUB) { // Compute bound. AffineMap boundMap; ValueDimList mapOperands; if (failed(ValueBoundsConstraintSet::computeBound( - boundMap, mapOperands, type, value, dim, stopCondition, closedUB))) + boundMap, mapOperands, type, var, stopCondition, closedUB))) return failure(); // Materialize tensor.dim/memref.dim ops. @@ -128,7 +127,7 @@ FailureOr mlir::arith::reifyShapedValueDimBound( // the owner of `value`. return v != value; }; - return reifyValueBound(b, loc, type, value, dim, + return reifyValueBound(b, loc, type, {value, dim}, stopCondition ? stopCondition : reifyToOperands, closedUB); } @@ -140,7 +139,7 @@ FailureOr mlir::arith::reifyIndexValueBound( ValueBoundsConstraintSet &cstr) { return v != value; }; - return reifyValueBound(b, loc, type, value, /*dim=*/std::nullopt, + return reifyValueBound(b, loc, type, value, stopCondition ? 
stopCondition : reifyToOperands, closedUB); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp b/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp index 8c4b70db248989..518d2e138c02a9 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp @@ -72,8 +72,10 @@ static LogicalResult computePaddedShape(linalg::LinalgOp opToPad, // Otherwise, try to compute a constant upper bound for the size value. FailureOr upperBound = ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, opOperand->get(), - /*dim=*/i, /*stopCondition=*/nullptr, /*closedUB=*/true); + presburger::BoundType::UB, + {opOperand->get(), + /*dim=*/i}, + /*stopCondition=*/nullptr, /*closedUB=*/true); if (failed(upperBound)) { LLVM_DEBUG(DBGS() << "----could not compute a bounding box for padding"); return failure(); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp index ac896d6c30d049..71eb59d40836c1 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp @@ -257,14 +257,12 @@ FailureOr mlir::linalg::promoteSubviewAsNewBuffer( if (auto attr = llvm::dyn_cast_if_present(rangeValue.size)) { size = getValueOrCreateConstantIndexOp(b, loc, rangeValue.size); } else { - Value materializedSize = - getValueOrCreateConstantIndexOp(b, loc, rangeValue.size); FailureOr upperBound = ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, materializedSize, /*dim=*/std::nullopt, + presburger::BoundType::UB, rangeValue.size, /*stopCondition=*/nullptr, /*closedUB=*/true); size = failed(upperBound) - ? materializedSize + ? getValueOrCreateConstantIndexOp(b, loc, rangeValue.size) : b.create(loc, *upperBound); } LLVM_DEBUG(llvm::dbgs() << "Extracted tightest: " << size << "\n"); diff --git a/mlir/lib/Dialect/MemRef/Transforms/IndependenceTransforms.cpp b/mlir/lib/Dialect/MemRef/Transforms/IndependenceTransforms.cpp index 10ba508265e7b9..1f06318cbd60e0 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/IndependenceTransforms.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/IndependenceTransforms.cpp @@ -23,12 +23,11 @@ static FailureOr makeIndependent(OpBuilder &b, Location loc, ValueRange independencies) { if (ofr.is()) return ofr; - Value value = ofr.get(); AffineMap boundMap; ValueDimList mapOperands; if (failed(ValueBoundsConstraintSet::computeIndependentBound( - boundMap, mapOperands, presburger::BoundType::UB, value, - /*dim=*/std::nullopt, independencies, /*closedUB=*/true))) + boundMap, mapOperands, presburger::BoundType::UB, ofr, independencies, + /*closedUB=*/true))) return failure(); return affine::materializeComputedBound(b, loc, boundMap, mapOperands); } diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 90b49b2528b790..e500d0fca741fb 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1656,6 +1656,17 @@ LogicalResult DistributeOp::verify() { return emitError( "expected equal sizes for allocate and allocator variables"); + if (!isWrapper()) + return emitOpError() << "must be a loop wrapper"; + + if (LoopWrapperInterface nested = getNestedWrapper()) { + // Check for the allowed leaf constructs that may appear in a composite + // construct directly after DISTRIBUTE. 
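// Illustrative aside (not part of the patch): in IR terms, the verifier check
// below accepts composite forms whose next nested wrapper is 'omp.parallel'
// or 'omp.simdloop', e.g. (clauses and types elided):
//
//   omp.distribute {
//     omp.parallel {
//       ...
//     }
//   }
//
// and rejects any other loop wrapper nested directly inside omp.distribute.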
+ if (!isa(nested)) + return emitError() << "only supported nested wrappers are 'omp.parallel' " + "and 'omp.simdloop'"; + } + return success(); } @@ -1818,9 +1829,8 @@ void TaskloopOp::build(OpBuilder &builder, OperationState &state, MLIRContext *ctx = builder.getContext(); // TODO Store clauses in op: reductionByRefAttr, privateVars, privatizers. TaskloopOp::build( - builder, state, clauses.loopLBVar, clauses.loopUBVar, clauses.loopStepVar, - clauses.loopInclusiveAttr, clauses.ifVar, clauses.finalVar, - clauses.untiedAttr, clauses.mergeableAttr, clauses.inReductionVars, + builder, state, clauses.ifVar, clauses.finalVar, clauses.untiedAttr, + clauses.mergeableAttr, clauses.inReductionVars, makeArrayAttr(ctx, clauses.inReductionDeclSymbols), clauses.reductionVars, makeArrayAttr(ctx, clauses.reductionDeclSymbols), clauses.priorityVar, clauses.allocateVars, clauses.allocatorVars, clauses.grainsizeVar, @@ -1859,6 +1869,16 @@ LogicalResult TaskloopOp::verify() { "the grainsize clause and num_tasks clause are mutually exclusive and " "may not appear on the same taskloop directive"); } + + if (!isWrapper()) + return emitOpError() << "must be a loop wrapper"; + + if (LoopWrapperInterface nested = getNestedWrapper()) { + // Check for the allowed leaf constructs that may appear in a composite + // construct directly after TASKLOOP. + if (!isa(nested)) + return emitError() << "only supported nested wrapper is 'omp.simdloop'"; + } return success(); } diff --git a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp index 087ffc438a830a..17a1c016ea16d5 100644 --- a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp @@ -61,12 +61,13 @@ struct ForOpInterface // An EQ constraint can be added if the yielded value (dimension size) // equals the corresponding block argument (dimension size). 
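// Illustrative aside (not part of the patch): for a loop such as
//
//   %r = scf.for %i = %lb to %ub step %s iter_args(%arg0 = %init)
//       -> tensor<?xf32> {
//     ...
//     scf.yield %arg0 : tensor<?xf32>
//   }
//
// the yielded value is the iter_arg itself, so the EQ comparison below
// succeeds and dim 0 of %r can be bounded by dim 0 of %init.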
if (cstr.populateAndCompare( - yieldedValue, dim, ValueBoundsConstraintSet::ComparisonOperator::EQ, - iterArg, dim)) { + /*lhs=*/{yieldedValue, dim}, + ValueBoundsConstraintSet::ComparisonOperator::EQ, + /*rhs=*/{iterArg, dim})) { if (dim.has_value()) { cstr.bound(value)[*dim] == cstr.getExpr(initArg, dim); } else { - cstr.bound(value) == initArg; + cstr.bound(value) == cstr.getExpr(initArg); } } } @@ -113,8 +114,9 @@ struct IfOpInterface // * result <= elseValue // * result >= thenValue if (cstr.populateAndCompare( - thenValue, dim, ValueBoundsConstraintSet::ComparisonOperator::LE, - elseValue, dim)) { + /*lhs=*/{thenValue, dim}, + ValueBoundsConstraintSet::ComparisonOperator::LE, + /*rhs=*/{elseValue, dim})) { if (dim) { cstr.bound(value)[*dim] >= cstr.getExpr(thenValue, dim); cstr.bound(value)[*dim] <= cstr.getExpr(elseValue, dim); @@ -127,8 +129,9 @@ struct IfOpInterface // * result <= thenValue // * result >= elseValue if (cstr.populateAndCompare( - elseValue, dim, ValueBoundsConstraintSet::ComparisonOperator::LE, - thenValue, dim)) { + /*lhs=*/{elseValue, dim}, + ValueBoundsConstraintSet::ComparisonOperator::LE, + /*rhs=*/{thenValue, dim})) { if (dim) { cstr.bound(value)[*dim] >= cstr.getExpr(elseValue, dim); cstr.bound(value)[*dim] <= cstr.getExpr(thenValue, dim); diff --git a/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp b/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp index fea2f659535bb4..7b4024b6861a72 100644 --- a/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp @@ -101,38 +101,30 @@ FailureOr mlir::scf::upliftWhileToForLoop(RewriterBase &rewriter, Block *afterBody = loop.getAfterBody(); scf::YieldOp afterTerm = loop.getYieldOp(); - auto argNumber = inductionVar.getArgNumber(); - auto afterTermIndArg = afterTerm.getResults()[argNumber]; + unsigned argNumber = inductionVar.getArgNumber(); + Value afterTermIndArg = afterTerm.getResults()[argNumber]; - auto inductionVarAfter = afterBody->getArgument(argNumber); - - Value step; + Value inductionVarAfter = afterBody->getArgument(argNumber); // Find suitable `addi` op inside `after` block, one of the args must be an // Induction var passed from `before` block and second arg must be defined // outside of the loop and will be considered step value. // TODO: Add `subi` support? - for (auto &use : inductionVarAfter.getUses()) { - auto owner = dyn_cast(use.getOwner()); - if (!owner) - continue; - - auto other = - (inductionVarAfter == owner.getLhs() ? 
owner.getRhs() : owner.getLhs());
-    if (!dom.properlyDominates(other, loop))
-      continue;
-
-    if (afterTermIndArg != owner.getResult())
-      continue;
+  auto addOp = afterTermIndArg.getDefiningOp<arith::AddIOp>();
+  if (!addOp)
+    return rewriter.notifyMatchFailure(loop, "Didn't find a suitable 'addi' op");
 
-    step = other;
-    break;
+  Value step;
+  if (addOp.getLhs() == inductionVarAfter) {
+    step = addOp.getRhs();
+  } else if (addOp.getRhs() == inductionVarAfter) {
+    step = addOp.getLhs();
   }
 
-  if (!step)
-    return rewriter.notifyMatchFailure(loop, "Didn't found suitable 'addi' op");
+  if (!step || !dom.properlyDominates(step, loop))
+    return rewriter.notifyMatchFailure(loop, "Invalid 'addi' form");
 
-  auto lb = loop.getInits()[argNumber];
+  Value lb = loop.getInits()[argNumber];
   assert(lb.getType().isIntOrIndex());
   assert(lb.getType() == ub.getType());
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
index 67080d8e301c13..d25efcf50ec566 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
@@ -289,8 +289,7 @@ static UnpackTileDimInfo getUnpackTileDimInfo(OpBuilder &b, UnPackOp unpackOp,
   info.isAlignedToInnerTileSize = false;
   FailureOr<int64_t> cstSize = ValueBoundsConstraintSet::computeConstantBound(
-      presburger::BoundType::UB,
-      getValueOrCreateConstantIndexOp(b, loc, tileSize), /*dim=*/std::nullopt,
+      presburger::BoundType::UB, tileSize,
       /*stopCondition=*/nullptr, /*closedUB=*/true);
   std::optional<int64_t> cstInnerSize = getConstantIntValue(innerTileSize);
   if (!failed(cstSize) && cstInnerSize) {
diff --git a/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp b/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp
index 721730862d49b3..a89ce20048dff3 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp
@@ -28,7 +28,8 @@ static FailureOr<OpFoldResult> makeIndependent(OpBuilder &b, Location loc,
   ValueDimList mapOperands;
   if (failed(ValueBoundsConstraintSet::computeIndependentBound(
           boundMap, mapOperands, presburger::BoundType::UB, value,
-          /*dim=*/std::nullopt, independencies, /*closedUB=*/true)))
+          independencies,
+          /*closedUB=*/true)))
     return failure();
   return mlir::affine::materializeComputedBound(b, loc, boundMap, mapOperands);
 }
diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
index 2dd91e2f7a1700..15381ec520e211 100644
--- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
@@ -154,7 +154,7 @@ bool mlir::tensor::isCastLikeInsertSliceOp(InsertSliceOp op) {
       continue;
     }
     FailureOr<bool> equalDimSize = ValueBoundsConstraintSet::areEqual(
-        op.getSource(), op.getResult(), srcDim, resultDim);
+        {op.getSource(), srcDim}, {op.getResult(), resultDim});
     if (failed(equalDimSize) || !*equalDimSize)
       return false;
     ++srcDim;
@@ -178,7 +178,7 @@ bool mlir::tensor::isCastLikeExtractSliceOp(ExtractSliceOp op) {
       continue;
     }
     FailureOr<bool> equalDimSize = ValueBoundsConstraintSet::areEqual(
-        op.getSource(), op.getResult(), dim, resultDim);
+        {op.getSource(), dim}, {op.getResult(), resultDim});
     if (failed(equalDimSize) || !*equalDimSize)
       return false;
     ++resultDim;
diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
index ffa4c0b55cad7c..87937591e60ad8 100644
--- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
+++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
@@ -25,6
+25,12 @@ namespace mlir { #include "mlir/Interfaces/ValueBoundsOpInterface.cpp.inc" } // namespace mlir +static Operation *getOwnerOfValue(Value value) { + if (auto bbArg = dyn_cast(value)) + return bbArg.getOwner()->getParentOp(); + return value.getDefiningOp(); +} + HyperrectangularSlice::HyperrectangularSlice(ArrayRef offsets, ArrayRef sizes, ArrayRef strides) @@ -67,6 +73,83 @@ static std::optional getConstantIntValue(OpFoldResult ofr) { return std::nullopt; } +ValueBoundsConstraintSet::Variable::Variable(OpFoldResult ofr) + : Variable(ofr, std::nullopt) {} + +ValueBoundsConstraintSet::Variable::Variable(Value indexValue) + : Variable(static_cast(indexValue)) {} + +ValueBoundsConstraintSet::Variable::Variable(Value shapedValue, int64_t dim) + : Variable(static_cast(shapedValue), std::optional(dim)) {} + +ValueBoundsConstraintSet::Variable::Variable(OpFoldResult ofr, + std::optional dim) { + Builder b(ofr.getContext()); + if (auto constInt = ::getConstantIntValue(ofr)) { + assert(!dim && "expected no dim for index-typed values"); + map = AffineMap::get(/*dimCount=*/0, /*symbolCount=*/0, + b.getAffineConstantExpr(*constInt)); + return; + } + Value value = cast(ofr); +#ifndef NDEBUG + if (dim) { + assert(isa(value.getType()) && "expected shaped type"); + } else { + assert(value.getType().isIndex() && "expected index type"); + } +#endif // NDEBUG + map = AffineMap::get(/*dimCount=*/0, /*symbolCount=*/1, + b.getAffineSymbolExpr(0)); + mapOperands.emplace_back(value, dim); +} + +ValueBoundsConstraintSet::Variable::Variable(AffineMap map, + ArrayRef mapOperands) { + assert(map.getNumResults() == 1 && "expected single result"); + + // Turn all dims into symbols. + Builder b(map.getContext()); + SmallVector dimReplacements, symReplacements; + for (int64_t i = 0, e = map.getNumDims(); i < e; ++i) + dimReplacements.push_back(b.getAffineSymbolExpr(i)); + for (int64_t i = 0, e = map.getNumSymbols(); i < e; ++i) + symReplacements.push_back(b.getAffineSymbolExpr(i + map.getNumDims())); + AffineMap tmpMap = map.replaceDimsAndSymbols( + dimReplacements, symReplacements, /*numResultDims=*/0, + /*numResultSyms=*/map.getNumSymbols() + map.getNumDims()); + + // Inline operands. + DenseMap replacements; + for (auto [index, var] : llvm::enumerate(mapOperands)) { + assert(var.map.getNumResults() == 1 && "expected single result"); + assert(var.map.getNumDims() == 0 && "expected only symbols"); + SmallVector symReplacements; + for (auto valueDim : var.mapOperands) { + auto it = llvm::find(this->mapOperands, valueDim); + if (it != this->mapOperands.end()) { + // There is already a symbol for this operand. + symReplacements.push_back(b.getAffineSymbolExpr( + std::distance(this->mapOperands.begin(), it))); + } else { + // This is a new operand: add a new symbol. 
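// Illustrative aside (not part of the patch): this dedup means that composing
// an outer map (s0, s1) -> s0 + s1 with operand variables u = s0 * 2 and
// v = s0 + 1, where u and v share the same underlying value x, yields a
// single-operand variable (s0 * 2) + (s0 + 1) with s0 bound to x only once.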
+ symReplacements.push_back( + b.getAffineSymbolExpr(this->mapOperands.size())); + this->mapOperands.push_back(valueDim); + } + } + replacements[b.getAffineSymbolExpr(index)] = + var.map.getResult(0).replaceSymbols(symReplacements); + } + this->map = tmpMap.replace(replacements, /*numResultDims=*/0, + /*numResultSyms=*/this->mapOperands.size()); +} + +ValueBoundsConstraintSet::Variable::Variable(AffineMap map, + ArrayRef mapOperands) + : Variable(map, llvm::map_to_vector(mapOperands, + [](Value v) { return Variable(v); })) {} + ValueBoundsConstraintSet::ValueBoundsConstraintSet( MLIRContext *ctx, StopConditionFn stopCondition) : builder(ctx), stopCondition(stopCondition) { @@ -176,6 +259,11 @@ int64_t ValueBoundsConstraintSet::insert(Value value, assert(!valueDimToPosition.contains(valueDim) && "already mapped"); int64_t pos = isSymbol ? cstr.appendVar(VarKind::Symbol) : cstr.appendVar(VarKind::SetDim); + LLVM_DEBUG(llvm::dbgs() << "Inserting constraint set column " << pos + << " for: " << value + << " (dim: " << dim.value_or(kIndexValue) + << ", owner: " << getOwnerOfValue(value)->getName() + << ")\n"); positionToValueDim.insert(positionToValueDim.begin() + pos, valueDim); // Update reverse mapping. for (int64_t i = pos, e = positionToValueDim.size(); i < e; ++i) @@ -194,6 +282,8 @@ int64_t ValueBoundsConstraintSet::insert(Value value, int64_t ValueBoundsConstraintSet::insert(bool isSymbol) { int64_t pos = isSymbol ? cstr.appendVar(VarKind::Symbol) : cstr.appendVar(VarKind::SetDim); + LLVM_DEBUG(llvm::dbgs() << "Inserting anonymous constraint set column " << pos + << "\n"); positionToValueDim.insert(positionToValueDim.begin() + pos, std::nullopt); // Update reverse mapping. for (int64_t i = pos, e = positionToValueDim.size(); i < e; ++i) @@ -224,6 +314,10 @@ int64_t ValueBoundsConstraintSet::insert(AffineMap map, ValueDimList operands, return pos; } +int64_t ValueBoundsConstraintSet::insert(const Variable &var, bool isSymbol) { + return insert(var.map, var.mapOperands, isSymbol); +} + int64_t ValueBoundsConstraintSet::getPos(Value value, std::optional dim) const { #ifndef NDEBUG @@ -232,7 +326,10 @@ int64_t ValueBoundsConstraintSet::getPos(Value value, cast(value).getOwner()->isEntryBlock()) && "unstructured control flow is not supported"); #endif // NDEBUG - + LLVM_DEBUG(llvm::dbgs() << "Getting pos for: " << value + << " (dim: " << dim.value_or(kIndexValue) + << ", owner: " << getOwnerOfValue(value)->getName() + << ")\n"); auto it = valueDimToPosition.find(std::make_pair(value, dim.value_or(kIndexValue))); assert(it != valueDimToPosition.end() && "expected mapped entry"); @@ -253,12 +350,6 @@ bool ValueBoundsConstraintSet::isMapped(Value value, return it != valueDimToPosition.end(); } -static Operation *getOwnerOfValue(Value value) { - if (auto bbArg = dyn_cast(value)) - return bbArg.getOwner()->getParentOp(); - return value.getDefiningOp(); -} - void ValueBoundsConstraintSet::processWorklist() { LLVM_DEBUG(llvm::dbgs() << "Processing value bounds worklist...\n"); while (!worklist.empty()) { @@ -346,41 +437,47 @@ void ValueBoundsConstraintSet::projectOut( } } +void ValueBoundsConstraintSet::projectOutAnonymous( + std::optional except) { + int64_t nextPos = 0; + while (nextPos < static_cast(positionToValueDim.size())) { + if (positionToValueDim[nextPos].has_value() || except == nextPos) { + ++nextPos; + } else { + projectOut(nextPos); + // The column was projected out so another column is now at that position. + // Do not increase the counter. 
+ } + } +} + LogicalResult ValueBoundsConstraintSet::computeBound( AffineMap &resultMap, ValueDimList &mapOperands, presburger::BoundType type, - Value value, std::optional dim, StopConditionFn stopCondition, - bool closedUB) { -#ifndef NDEBUG - assertValidValueDim(value, dim); -#endif // NDEBUG - + const Variable &var, StopConditionFn stopCondition, bool closedUB) { + MLIRContext *ctx = var.getContext(); int64_t ubAdjustment = closedUB ? 0 : 1; - Builder b(value.getContext()); + Builder b(ctx); mapOperands.clear(); // Process the backward slice of `value` (i.e., reverse use-def chain) until // `stopCondition` is met. - ValueDim valueDim = std::make_pair(value, dim.value_or(kIndexValue)); - ValueBoundsConstraintSet cstr(value.getContext(), stopCondition); - assert(!stopCondition(value, dim, cstr) && - "stop condition should not be satisfied for starting point"); - int64_t pos = cstr.insert(value, dim, /*isSymbol=*/false); + ValueBoundsConstraintSet cstr(ctx, stopCondition); + int64_t pos = cstr.insert(var, /*isSymbol=*/false); + assert(pos == 0 && "expected first column"); cstr.processWorklist(); // Project out all variables (apart from `valueDim`) that do not match the // stop condition. cstr.projectOut([&](ValueDim p) { - // Do not project out `valueDim`. - if (valueDim == p) - return false; auto maybeDim = p.second == kIndexValue ? std::nullopt : std::make_optional(p.second); return !stopCondition(p.first, maybeDim, cstr); }); + cstr.projectOutAnonymous(/*except=*/pos); // Compute lower and upper bounds for `valueDim`. SmallVector lb(1), ub(1); - cstr.cstr.getSliceBounds(pos, 1, value.getContext(), &lb, &ub, + cstr.cstr.getSliceBounds(pos, 1, ctx, &lb, &ub, /*closedUB=*/true); // Note: There are TODOs in the implementation of `getSliceBounds`. In such a @@ -477,10 +574,9 @@ LogicalResult ValueBoundsConstraintSet::computeBound( LogicalResult ValueBoundsConstraintSet::computeDependentBound( AffineMap &resultMap, ValueDimList &mapOperands, presburger::BoundType type, - Value value, std::optional dim, ValueDimList dependencies, - bool closedUB) { + const Variable &var, ValueDimList dependencies, bool closedUB) { return computeBound( - resultMap, mapOperands, type, value, dim, + resultMap, mapOperands, type, var, [&](Value v, std::optional d, ValueBoundsConstraintSet &cstr) { return llvm::is_contained(dependencies, std::make_pair(v, d)); }, @@ -489,8 +585,7 @@ LogicalResult ValueBoundsConstraintSet::computeDependentBound( LogicalResult ValueBoundsConstraintSet::computeIndependentBound( AffineMap &resultMap, ValueDimList &mapOperands, presburger::BoundType type, - Value value, std::optional dim, ValueRange independencies, - bool closedUB) { + const Variable &var, ValueRange independencies, bool closedUB) { // Return "true" if the given value is independent of all values in // `independencies`. I.e., neither the value itself nor any value in the // backward slice (reverse use-def chain) is contained in `independencies`. @@ -516,7 +611,7 @@ LogicalResult ValueBoundsConstraintSet::computeIndependentBound( // Reify bounds in terms of any independent values. 
return computeBound( - resultMap, mapOperands, type, value, dim, + resultMap, mapOperands, type, var, [&](Value v, std::optional d, ValueBoundsConstraintSet &cstr) { return isIndependent(v); }, @@ -524,35 +619,8 @@ LogicalResult ValueBoundsConstraintSet::computeIndependentBound( } FailureOr ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType type, Value value, std::optional dim, - StopConditionFn stopCondition, bool closedUB) { -#ifndef NDEBUG - assertValidValueDim(value, dim); -#endif // NDEBUG - - AffineMap map = - AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0, - Builder(value.getContext()).getAffineDimExpr(0)); - return computeConstantBound(type, map, {{value, dim}}, stopCondition, - closedUB); -} - -FailureOr ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType type, AffineMap map, ArrayRef operands, + presburger::BoundType type, const Variable &var, StopConditionFn stopCondition, bool closedUB) { - ValueDimList valueDims; - for (Value v : operands) { - assert(v.getType().isIndex() && "expected index type"); - valueDims.emplace_back(v, std::nullopt); - } - return computeConstantBound(type, map, valueDims, stopCondition, closedUB); -} - -FailureOr ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType type, AffineMap map, ValueDimList operands, - StopConditionFn stopCondition, bool closedUB) { - assert(map.getNumResults() == 1 && "expected affine map with one result"); - // Default stop condition if none was specified: Keep adding constraints until // a bound could be computed. int64_t pos = 0; @@ -562,8 +630,8 @@ FailureOr ValueBoundsConstraintSet::computeConstantBound( }; ValueBoundsConstraintSet cstr( - map.getContext(), stopCondition ? stopCondition : defaultStopCondition); - pos = cstr.populateConstraints(map, operands); + var.getContext(), stopCondition ? stopCondition : defaultStopCondition); + pos = cstr.populateConstraints(var.map, var.mapOperands); assert(pos == 0 && "expected `map` is the first column"); // Compute constant bound for `valueDim`. @@ -608,22 +676,13 @@ ValueBoundsConstraintSet::computeConstantDelta(Value value1, Value value2, Builder b(value1.getContext()); AffineMap map = AffineMap::get(/*dimCount=*/2, /*symbolCount=*/0, b.getAffineDimExpr(0) - b.getAffineDimExpr(1)); - return computeConstantBound(presburger::BoundType::EQ, map, - {{value1, dim1}, {value2, dim2}}); + return computeConstantBound(presburger::BoundType::EQ, + Variable(map, {{value1, dim1}, {value2, dim2}})); } -bool ValueBoundsConstraintSet::compareValueDims(OpFoldResult lhs, - std::optional lhsDim, - ComparisonOperator cmp, - OpFoldResult rhs, - std::optional rhsDim) { -#ifndef NDEBUG - if (auto lhsVal = dyn_cast(lhs)) - assertValidValueDim(lhsVal, lhsDim); - if (auto rhsVal = dyn_cast(rhs)) - assertValidValueDim(rhsVal, rhsDim); -#endif // NDEBUG - +bool ValueBoundsConstraintSet::comparePos(int64_t lhsPos, + ComparisonOperator cmp, + int64_t rhsPos) { // This function returns "true" if "lhs CMP rhs" is proven to hold. // // Example for ComparisonOperator::LE and index-typed values: We would like to @@ -640,50 +699,6 @@ bool ValueBoundsConstraintSet::compareValueDims(OpFoldResult lhs, return false; } - // EQ can be expressed as LE and GE. - if (cmp == EQ) - return compareValueDims(lhs, lhsDim, ComparisonOperator::LE, rhs, rhsDim) && - compareValueDims(lhs, lhsDim, ComparisonOperator::GE, rhs, rhsDim); - - // Construct inequality. For the above example: lhs > rhs. 
- // `IntegerRelation` inequalities are expressed in the "flattened" form and - // with ">= 0". I.e., lhs - rhs - 1 >= 0. - SmallVector eq(cstr.getNumCols(), 0); - auto addToEq = [&](OpFoldResult ofr, std::optional dim, - int64_t factor) { - if (auto constVal = ::getConstantIntValue(ofr)) { - eq[cstr.getNumCols() - 1] += *constVal * factor; - } else { - eq[getPos(cast(ofr), dim)] += factor; - } - }; - if (cmp == LT || cmp == LE) { - addToEq(lhs, lhsDim, 1); - addToEq(rhs, rhsDim, -1); - } else if (cmp == GT || cmp == GE) { - addToEq(lhs, lhsDim, -1); - addToEq(rhs, rhsDim, 1); - } else { - llvm_unreachable("unsupported comparison operator"); - } - if (cmp == LE || cmp == GE) - eq[cstr.getNumCols() - 1] -= 1; - - // Add inequality to the constraint set and check if it made the constraint - // set empty. - int64_t ineqPos = cstr.getNumInequalities(); - cstr.addInequality(eq); - bool isEmpty = cstr.isEmpty(); - cstr.removeInequality(ineqPos); - return isEmpty; -} - -bool ValueBoundsConstraintSet::comparePos(int64_t lhsPos, - ComparisonOperator cmp, - int64_t rhsPos) { - // This function returns "true" if "lhs CMP rhs" is proven to hold. For - // detailed documentation, see `compareValueDims`. - // EQ can be expressed as LE and GE. if (cmp == EQ) return comparePos(lhsPos, ComparisonOperator::LE, rhsPos) && @@ -712,48 +727,17 @@ bool ValueBoundsConstraintSet::comparePos(int64_t lhsPos, return isEmpty; } -bool ValueBoundsConstraintSet::populateAndCompare( - OpFoldResult lhs, std::optional lhsDim, ComparisonOperator cmp, - OpFoldResult rhs, std::optional rhsDim) { -#ifndef NDEBUG - if (auto lhsVal = dyn_cast(lhs)) - assertValidValueDim(lhsVal, lhsDim); - if (auto rhsVal = dyn_cast(rhs)) - assertValidValueDim(rhsVal, rhsDim); -#endif // NDEBUG - - if (auto lhsVal = dyn_cast(lhs)) - populateConstraints(lhsVal, lhsDim); - if (auto rhsVal = dyn_cast(rhs)) - populateConstraints(rhsVal, rhsDim); - - return compareValueDims(lhs, lhsDim, cmp, rhs, rhsDim); +bool ValueBoundsConstraintSet::populateAndCompare(const Variable &lhs, + ComparisonOperator cmp, + const Variable &rhs) { + int64_t lhsPos = populateConstraints(lhs.map, lhs.mapOperands); + int64_t rhsPos = populateConstraints(rhs.map, rhs.mapOperands); + return comparePos(lhsPos, cmp, rhsPos); } -bool ValueBoundsConstraintSet::compare(OpFoldResult lhs, - std::optional lhsDim, - ComparisonOperator cmp, OpFoldResult rhs, - std::optional rhsDim) { - auto stopCondition = [&](Value v, std::optional dim, - ValueBoundsConstraintSet &cstr) { - // Keep processing as long as lhs/rhs are not mapped. - if (auto lhsVal = dyn_cast(lhs)) - if (!cstr.isMapped(lhsVal, dim)) - return false; - if (auto rhsVal = dyn_cast(rhs)) - if (!cstr.isMapped(rhsVal, dim)) - return false; - // Keep processing as long as the relation cannot be proven. 
- return cstr.compareValueDims(lhs, lhsDim, cmp, rhs, rhsDim); - }; - - ValueBoundsConstraintSet cstr(lhs.getContext(), stopCondition); - return cstr.populateAndCompare(lhs, lhsDim, cmp, rhs, rhsDim); -} - -bool ValueBoundsConstraintSet::compare(AffineMap lhs, ValueDimList lhsOperands, - ComparisonOperator cmp, AffineMap rhs, - ValueDimList rhsOperands) { +bool ValueBoundsConstraintSet::compare(const Variable &lhs, + ComparisonOperator cmp, + const Variable &rhs) { int64_t lhsPos = -1, rhsPos = -1; auto stopCondition = [&](Value v, std::optional dim, ValueBoundsConstraintSet &cstr) { @@ -765,39 +749,17 @@ bool ValueBoundsConstraintSet::compare(AffineMap lhs, ValueDimList lhsOperands, return cstr.comparePos(lhsPos, cmp, rhsPos); }; ValueBoundsConstraintSet cstr(lhs.getContext(), stopCondition); - lhsPos = cstr.insert(lhs, lhsOperands); - rhsPos = cstr.insert(rhs, rhsOperands); - cstr.processWorklist(); + lhsPos = cstr.populateConstraints(lhs.map, lhs.mapOperands); + rhsPos = cstr.populateConstraints(rhs.map, rhs.mapOperands); return cstr.comparePos(lhsPos, cmp, rhsPos); } -bool ValueBoundsConstraintSet::compare(AffineMap lhs, - ArrayRef lhsOperands, - ComparisonOperator cmp, AffineMap rhs, - ArrayRef rhsOperands) { - ValueDimList lhsValueDimOperands = - llvm::map_to_vector(lhsOperands, [](Value v) { - return std::make_pair(v, std::optional()); - }); - ValueDimList rhsValueDimOperands = - llvm::map_to_vector(rhsOperands, [](Value v) { - return std::make_pair(v, std::optional()); - }); - return ValueBoundsConstraintSet::compare(lhs, lhsValueDimOperands, cmp, rhs, - rhsValueDimOperands); -} - -FailureOr -ValueBoundsConstraintSet::areEqual(OpFoldResult value1, OpFoldResult value2, - std::optional dim1, - std::optional dim2) { - if (ValueBoundsConstraintSet::compare(value1, dim1, ComparisonOperator::EQ, - value2, dim2)) +FailureOr ValueBoundsConstraintSet::areEqual(const Variable &var1, + const Variable &var2) { + if (ValueBoundsConstraintSet::compare(var1, ComparisonOperator::EQ, var2)) return true; - if (ValueBoundsConstraintSet::compare(value1, dim1, ComparisonOperator::LT, - value2, dim2) || - ValueBoundsConstraintSet::compare(value1, dim1, ComparisonOperator::GT, - value2, dim2)) + if (ValueBoundsConstraintSet::compare(var1, ComparisonOperator::LT, var2) || + ValueBoundsConstraintSet::compare(var1, ComparisonOperator::GT, var2)) return false; return failure(); } @@ -833,7 +795,7 @@ ValueBoundsConstraintSet::areOverlappingSlices(MLIRContext *ctx, AffineMap foldedMap = foldAttributesIntoMap(b, map, ofrOperands, valueOperands); FailureOr constBound = computeConstantBound( - presburger::BoundType::EQ, foldedMap, valueOperands); + presburger::BoundType::EQ, Variable(foldedMap, valueOperands)); foundUnknownBound |= failed(constBound); if (succeeded(constBound) && *constBound <= 0) return false; @@ -850,7 +812,7 @@ ValueBoundsConstraintSet::areOverlappingSlices(MLIRContext *ctx, AffineMap foldedMap = foldAttributesIntoMap(b, map, ofrOperands, valueOperands); FailureOr constBound = computeConstantBound( - presburger::BoundType::EQ, foldedMap, valueOperands); + presburger::BoundType::EQ, Variable(foldedMap, valueOperands)); foundUnknownBound |= failed(constBound); if (succeeded(constBound) && *constBound <= 0) return false; diff --git a/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir b/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir index 23c6872dcebe94..935c08aceff548 100644 --- a/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir +++ 
b/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir @@ -131,3 +131,27 @@ func.func @compare_affine_min(%a: index, %b: index) { "test.compare"(%0, %a) {cmp = "LE"} : (index, index) -> () return } + +// ----- + +func.func @compare_const_map() { + %c5 = arith.constant 5 : index + // expected-remark @below{{true}} + "test.compare"(%c5) {cmp = "GT", rhs_map = affine_map<() -> (4)>} + : (index) -> () + // expected-remark @below{{true}} + "test.compare"(%c5) {cmp = "LT", lhs_map = affine_map<() -> (4)>} + : (index) -> () + return +} + +// ----- + +func.func @compare_maps(%a: index, %b: index) { + // expected-remark @below{{true}} + "test.compare"(%a, %b, %b, %a) + {cmp = "GT", lhs_map = affine_map<(d0, d1) -> (1 + d0 + d1)>, + rhs_map = affine_map<(d0, d1) -> (d0 + d1)>} + : (index, index, index, index) -> () + return +} diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 88dca1b85ee5f7..7f86a7f5b3182e 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -1580,10 +1580,11 @@ func.func @omp_cancellationpoint2() { func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testmemref = "test.memref"() : () -> (memref) // expected-error @below {{expected equal sizes for allocate and allocator variables}} - "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testmemref) ({ - ^bb0(%arg3: i32, %arg4: i32): - "omp.terminator"() : () -> () - }) {operandSegmentSizes = array} : (i32, i32, i32, i32, i32, i32, memref) -> () + "omp.taskloop"(%testmemref) ({ + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + }) {operandSegmentSizes = array} : (memref) -> () return } @@ -1593,10 +1594,11 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) %testf32_2 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{expected as many reduction symbol references as reduction variables}} - "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32, %testf32_2) ({ - ^bb0(%arg3: i32, %arg4: i32): - "omp.terminator"() : () -> () - }) {operandSegmentSizes = array, reductions = [@add_f32]} : (i32, i32, i32, i32, i32, i32, !llvm.ptr, !llvm.ptr) -> () + "omp.taskloop"(%testf32, %testf32_2) ({ + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + }) {operandSegmentSizes = array, reductions = [@add_f32]} : (!llvm.ptr, !llvm.ptr) -> () return } @@ -1604,12 +1606,12 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) - %testf32_2 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{expected as many reduction symbol references as reduction variables}} - "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32) ({ - ^bb0(%arg3: i32, %arg4: i32): - "omp.terminator"() : () -> () - }) {operandSegmentSizes = array, reductions = [@add_f32, @add_f32]} : (i32, i32, i32, i32, i32, i32, !llvm.ptr) -> () + "omp.taskloop"(%testf32) ({ + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + }) {operandSegmentSizes = array, reductions = [@add_f32, @add_f32]} : (!llvm.ptr) -> () return } @@ -1619,10 +1621,11 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) %testf32_2 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{expected as many reduction symbol references as reduction variables}} - 
"omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32, %testf32_2) ({ - ^bb0(%arg3: i32, %arg4: i32): - "omp.terminator"() : () -> () - }) {in_reductions = [@add_f32], operandSegmentSizes = array} : (i32, i32, i32, i32, i32, i32, !llvm.ptr, !llvm.ptr) -> () + "omp.taskloop"(%testf32, %testf32_2) ({ + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + }) {in_reductions = [@add_f32], operandSegmentSizes = array} : (!llvm.ptr, !llvm.ptr) -> () return } @@ -1630,12 +1633,12 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) - %testf32_2 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{expected as many reduction symbol references as reduction variables}} - "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32_2) ({ - ^bb0(%arg3: i32, %arg4: i32): - "omp.terminator"() : () -> () - }) {in_reductions = [@add_f32, @add_f32], operandSegmentSizes = array} : (i32, i32, i32, i32, i32, i32, !llvm.ptr) -> () + "omp.taskloop"(%testf32) ({ + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + }) {in_reductions = [@add_f32, @add_f32], operandSegmentSizes = array} : (!llvm.ptr) -> () return } @@ -1657,9 +1660,10 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) %testf32_2 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{if a reduction clause is present on the taskloop directive, the nogroup clause must not be specified}} - omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) nogroup - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - omp.terminator + omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) nogroup { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } } return } @@ -1681,9 +1685,10 @@ combiner { func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{the same list item cannot appear in both a reduction and an in_reduction clause}} - omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr) in_reduction(@add_f32 -> %testf32 : !llvm.ptr) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - omp.terminator + omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr) in_reduction(@add_f32 -> %testf32 : !llvm.ptr) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } } return } @@ -1693,8 +1698,20 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testi64 = "test.i64"() : () -> (i64) // expected-error @below {{the grainsize clause and num_tasks clause are mutually exclusive and may not appear on the same taskloop directive}} - omp.taskloop grain_size(%testi64: i64) num_tasks(%testi64: i64) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.taskloop grain_size(%testi64: i64) num_tasks(%testi64: i64) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + } + return +} + +// ----- + +func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { + // expected-error @below {{op must be a loop wrapper}} + omp.taskloop { + %0 = arith.constant 0 : i32 omp.terminator } return @@ -1702,6 +1719,21 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) 
 
 // -----
 
+func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
+  // expected-error @below {{only supported nested wrapper is 'omp.simdloop'}}
+  omp.taskloop {
+    omp.distribute {
+      omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+        omp.yield
+      }
+      omp.terminator
+    }
+  }
+  return
+}
+
+// -----
+
 func.func @omp_threadprivate() {
   %1 = llvm.mlir.addressof @_QFsubEx : !llvm.ptr
   // expected-error @below {{op failed to verify that all of {sym_addr, tls_addr} have same type}}
@@ -1866,7 +1898,16 @@ func.func @omp_target_depend(%data_var: memref) {
 
 // -----
 
-func.func @omp_distribute(%data_var : memref) -> () {
+func.func @omp_distribute_schedule(%chunk_size : i32) -> () {
+  // expected-error @below {{op chunk size set without dist_schedule_static being present}}
+  "omp.distribute"(%chunk_size) <{operandSegmentSizes = array}> ({
+    "omp.terminator"() : () -> ()
+  }) : (i32) -> ()
+}
+
+// -----
+
+func.func @omp_distribute_allocate(%data_var : memref) -> () {
   // expected-error @below {{expected equal sizes for allocate and allocator variables}}
   "omp.distribute"(%data_var) <{operandSegmentSizes = array}> ({
     "omp.terminator"() : () -> ()
@@ -1875,6 +1916,29 @@
 
 // -----
 
+func.func @omp_distribute_wrapper() -> () {
+  // expected-error @below {{op must be a loop wrapper}}
+  "omp.distribute"() ({
+    %0 = arith.constant 0 : i32
+    "omp.terminator"() : () -> ()
+  }) : () -> ()
+}
+
+// -----
+
+func.func @omp_distribute_nested_wrapper(%data_var : memref) -> () {
+  // expected-error @below {{only supported nested wrappers are 'omp.parallel' and 'omp.simdloop'}}
+  "omp.distribute"() ({
+    "omp.wsloop"() ({
+      %0 = arith.constant 0 : i32
+      "omp.terminator"() : () -> ()
+    }) : () -> ()
+    "omp.terminator"() : () -> ()
+  }) : () -> ()
+}
+
+// -----
+
 omp.private {type = private} @x.privatizer : i32 alloc {
 ^bb0(%arg0: i32):
   %0 = arith.constant 0.0 : f32
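The wrapper errors introduced above ("op must be a loop wrapper", "only supported nested wrapper(s) ...") all enforce one structural contract: the region of a loop-wrapper op such as omp.distribute or omp.taskloop may hold nothing but a single omp.loop_nest, or another supported wrapper that eventually reaches one. As a minimal sketch of the shape these verifiers accept (mirroring the valid forms tested in ops.mlir below, with %lb, %ub and %step standing in for ordinary loop bounds):

  omp.distribute {
    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
      omp.yield
    }
  }

The dist_schedule case above relies on the same structure; chunk_size only becomes legal once dist_schedule_static is present, i.e. omp.distribute dist_schedule_static chunk_size(%chunk_size : i32) { ... }.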
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index 851d44ad984eef..802e1795b3fffb 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -171,6 +171,23 @@ func.func @omp_loop_nest(%lb : index, %ub : index, %step : index) -> () {
     omp.yield
   }
 
+  // TODO Remove induction variables from omp.wsloop.
+  omp.wsloop for (%iv) : index = (%lb) to (%ub) step (%step) {
+    // CHECK: omp.loop_nest
+    // CHECK-SAME: (%{{.*}}) : index =
+    // CHECK-SAME: (%{{.*}}) to (%{{.*}}) step (%{{.*}})
+    "omp.loop_nest" (%lb, %ub, %step) ({
+    ^bb0(%iv2: index):
+      // CHECK: test.op1
+      "test.op1"(%lb) : (index) -> ()
+      // CHECK: test.op2
+      "test.op2"() : () -> ()
+      // CHECK: omp.yield
+      omp.yield
+    }) : (index, index, index) -> ()
+    omp.yield
+  }
+
   return
 }
 
@@ -209,6 +226,22 @@ func.func @omp_loop_nest_pretty(%lb : index, %ub : index, %step : index) -> () {
     omp.yield
   }
 
+  // TODO Remove induction variables from omp.wsloop.
+  omp.wsloop for (%iv) : index = (%lb) to (%ub) step (%step) {
+    // CHECK: omp.loop_nest
+    // CHECK-SAME: (%{{.*}}) : index =
+    // CHECK-SAME: (%{{.*}}) to (%{{.*}}) step (%{{.*}})
+    omp.loop_nest (%iv2) : index = (%lb) to (%ub) step (%step) {
+      // CHECK: test.op1
+      "test.op1"(%lb) : (index) -> ()
+      // CHECK: test.op2
+      "test.op2"() : () -> ()
+      // CHECK: omp.yield
+      omp.yield
+    }
+    omp.yield
+  }
+
   return
 }
 
@@ -559,30 +592,54 @@ func.func @omp_simdloop_pretty_multiple(%lb1 : index, %ub1 : index, %step1 : ind
 }
 
 // CHECK-LABEL: omp_distribute
-func.func @omp_distribute(%chunk_size : i32, %data_var : memref) -> () {
+func.func @omp_distribute(%chunk_size : i32, %data_var : memref, %arg0 : i32) -> () {
   // CHECK: omp.distribute
   "omp.distribute" () ({
-    omp.terminator
+    "omp.loop_nest" (%arg0, %arg0, %arg0) ({
+    ^bb0(%iv: i32):
+      "omp.yield"() : () -> ()
+    }) : (i32, i32, i32) -> ()
+    "omp.terminator"() : () -> ()
   }) {} : () -> ()
   // CHECK: omp.distribute
   omp.distribute {
-    omp.terminator
+    omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) {
+      omp.yield
+    }
   }
   // CHECK: omp.distribute dist_schedule_static
   omp.distribute dist_schedule_static {
-    omp.terminator
+    omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) {
+      omp.yield
+    }
   }
   // CHECK: omp.distribute dist_schedule_static chunk_size(%{{.+}} : i32)
  omp.distribute dist_schedule_static chunk_size(%chunk_size : i32) {
-    omp.terminator
+    omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) {
+      omp.yield
+    }
   }
   // CHECK: omp.distribute order(concurrent)
   omp.distribute order(concurrent) {
-    omp.terminator
+    omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) {
+      omp.yield
+    }
   }
   // CHECK: omp.distribute allocate(%{{.+}} : memref -> %{{.+}} : memref)
   omp.distribute allocate(%data_var : memref -> %data_var : memref) {
-    omp.terminator
+    omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) {
+      omp.yield
+    }
+  }
+  // CHECK: omp.distribute
+  omp.distribute {
+    // TODO Remove induction variables from omp.simdloop.
+    omp.simdloop for (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) {
+      omp.loop_nest (%iv2) : i32 = (%arg0) to (%arg0) step (%arg0) {
+        omp.yield
+      }
+      omp.yield
+    }
   }
   return
 }
 
@@ -2000,135 +2057,128 @@ func.func @omp_taskgroup_clauses() -> () {
 
 // CHECK-LABEL: @omp_taskloop
 func.func @omp_taskloop(%lb: i32, %ub: i32, %step: i32) -> () {
-  // CHECK: omp.taskloop for (%{{.+}}) : i32 = (%{{.+}}) to (%{{.+}}) step (%{{.+}}) {
-  omp.taskloop for (%i) : i32 = (%lb) to (%ub) step (%step) {
-    // CHECK: omp.terminator
-    omp.terminator
-  }
-
-  // CHECK: omp.taskloop for (%{{.+}}) : i32 = (%{{.+}}) to (%{{.+}}) step (%{{.+}}) {
-  omp.taskloop for (%i) : i32 = (%lb) to (%ub) step (%step) {
-    // CHECK: test.op1
-    "test.op1"(%lb) : (i32) -> ()
-    // CHECK: test.op2
-    "test.op2"() : () -> ()
-    // CHECK: omp.terminator
-    omp.terminator
-  }
-
-  // CHECK: omp.taskloop for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) {
-  omp.taskloop for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
-    // CHECK: omp.terminator
-    omp.terminator
-  }
-
-  // CHECK: omp.taskloop for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) inclusive step (%{{.+}}, %{{.+}}) {
-  omp.taskloop for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) inclusive step (%step, %step) {
-    // CHECK: omp.terminator
-    omp.terminator
+  // CHECK: omp.taskloop {
+  omp.taskloop {
+    omp.loop_nest (%i) : i32 = (%lb) to (%ub) step (%step) {
+      // CHECK: omp.yield
+      omp.yield
+    }
   }
 
   %testbool = "test.bool"() : () -> (i1)
 
-  // CHECK: omp.taskloop if(%{{[^)]+}})
-  // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) {
-  omp.taskloop if(%testbool)
-  for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
-    // CHECK: omp.terminator
-    omp.terminator
+  // CHECK: omp.taskloop if(%{{[^)]+}}) {
+  omp.taskloop if(%testbool) {
+    omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
+      // CHECK: omp.yield
+      omp.yield
+    }
   }
 
-  // CHECK: omp.taskloop final(%{{[^)]+}})
-  // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) {
-  omp.taskloop final(%testbool)
-  for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
-    // CHECK: omp.terminator
-    omp.terminator
+  // CHECK: omp.taskloop final(%{{[^)]+}}) {
+  omp.taskloop final(%testbool) {
+    omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
+      // CHECK: omp.yield
+      omp.yield
+    }
   }
 
-  // CHECK: omp.taskloop untied
-  // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) {
-  omp.taskloop untied
-  for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
-    // CHECK: omp.terminator
-    omp.terminator
+  // CHECK: omp.taskloop untied {
+  omp.taskloop untied {
+    omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
+      // CHECK: omp.yield
+      omp.yield
+    }
   }
 
-  // CHECK: omp.taskloop mergeable
-  // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) {
-  omp.taskloop mergeable
-  for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
-    // CHECK: omp.terminator
-    omp.terminator
+  // CHECK: omp.taskloop mergeable {
+  omp.taskloop mergeable {
+    omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
+      // CHECK: omp.yield
+      omp.yield
+    }
   }
 
   %testf32 = "test.f32"() : () -> (!llvm.ptr)
   %testf32_2 = "test.f32"() : () -> (!llvm.ptr)
 
-  // CHECK: omp.taskloop in_reduction(@add_f32 -> %{{.+}} : !llvm.ptr, @add_f32 -> %{{.+}} : !llvm.ptr)
-  // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) {
-  omp.taskloop in_reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr)
-  for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
-    // CHECK: omp.terminator
-    omp.terminator
+  // CHECK: omp.taskloop in_reduction(@add_f32 -> %{{.+}} : !llvm.ptr, @add_f32 -> %{{.+}} : !llvm.ptr) {
+  omp.taskloop in_reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) {
+    omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
+      // CHECK: omp.yield
+      omp.yield
+    }
   }
 
-  // CHECK: omp.taskloop reduction(@add_f32 -> %{{.+}} : !llvm.ptr, @add_f32 -> %{{.+}} : !llvm.ptr)
-  // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) {
-  omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr)
-  for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
-    // CHECK: omp.terminator
-    omp.terminator
+  // CHECK: omp.taskloop reduction(@add_f32 -> %{{.+}} : !llvm.ptr, @add_f32 -> %{{.+}} : !llvm.ptr) {
+  omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) {
+    omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
+      // CHECK: omp.yield
+      omp.yield
+    }
   }
 
-  // CHECK: omp.taskloop in_reduction(@add_f32 -> %{{.+}} : !llvm.ptr) reduction(@add_f32 -> %{{.+}} : !llvm.ptr)
-  // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) {
-  omp.taskloop in_reduction(@add_f32 -> %testf32 : !llvm.ptr) reduction(@add_f32 -> %testf32_2 : !llvm.ptr)
-  for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
-    // CHECK: omp.terminator
-    omp.terminator
+  // CHECK: omp.taskloop in_reduction(@add_f32 -> %{{.+}} : !llvm.ptr) reduction(@add_f32 -> %{{.+}} : !llvm.ptr) {
+  omp.taskloop in_reduction(@add_f32 -> %testf32 : !llvm.ptr) reduction(@add_f32 -> %testf32_2 : !llvm.ptr) {
+    omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
+      // CHECK: omp.yield
+      omp.yield
+    }
   }
 
   %testi32 = "test.i32"() : () -> (i32)
 
-  // CHECK: omp.taskloop priority(%{{[^:]+}}: i32)
-  // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) {
-  omp.taskloop priority(%testi32: i32)
-  for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
-    // CHECK: omp.terminator
-    omp.terminator
+  // CHECK: omp.taskloop priority(%{{[^:]+}}: i32) {
+  omp.taskloop priority(%testi32: i32) {
+    omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
+      // CHECK: omp.yield
+      omp.yield
+    }
   }
 
   %testmemref = "test.memref"() : () -> (memref)
 
-  // CHECK: omp.taskloop allocate(%{{.+}} : memref -> %{{.+}} : memref)
-  omp.taskloop allocate(%testmemref : memref -> %testmemref : memref)
-  // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) {
-  for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
-    // CHECK: omp.terminator
-    omp.terminator
+  // CHECK: omp.taskloop allocate(%{{.+}} : memref -> %{{.+}} : memref) {
+  omp.taskloop allocate(%testmemref : memref -> %testmemref : memref) {
+    omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
+      // CHECK: omp.yield
+      omp.yield
+    }
   }
 
   %testi64 = "test.i64"() : () -> (i64)
 
-  // CHECK: omp.taskloop grain_size(%{{[^:]+}}: i64)
-  // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) {
-  omp.taskloop grain_size(%testi64: i64)
-  for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
-    // CHECK: omp.terminator
-    omp.terminator
+  // CHECK: omp.taskloop grain_size(%{{[^:]+}}: i64) {
+  omp.taskloop grain_size(%testi64: i64) {
+    omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
+      // CHECK: omp.yield
+      omp.yield
+    }
   }
 
-  // CHECK: omp.taskloop num_tasks(%{{[^:]+}}: i64)
-  // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) {
-  omp.taskloop num_tasks(%testi64: i64)
-  for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
-    // CHECK: omp.terminator
-    omp.terminator
+  // CHECK: omp.taskloop num_tasks(%{{[^:]+}}: i64) {
+  omp.taskloop num_tasks(%testi64: i64) {
+    omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
+      // CHECK: omp.yield
+      omp.yield
+    }
   }
 
-  // CHECK: omp.taskloop nogroup
-  // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) {
-  omp.taskloop nogroup
-  for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
-    // CHECK: omp.terminator
-    omp.terminator
+  // CHECK: omp.taskloop nogroup {
+  omp.taskloop nogroup {
+    omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
+      // CHECK: omp.yield
+      omp.yield
+    }
+  }
+
+  // CHECK: omp.taskloop {
+  omp.taskloop {
+    // TODO Remove induction variables from omp.simdloop.
+    omp.simdloop for (%iv) : i32 = (%lb) to (%ub) step (%step) {
+      omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
+        // CHECK: omp.yield
+        omp.yield
+      }
+      // CHECK: omp.yield
+      omp.yield
+    }
   }
 
   // CHECK: return
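The final taskloop case above exercises the one nesting the verifier allows: per the "only supported nested wrapper is 'omp.simdloop'" diagnostic in invalid.mlir, omp.simdloop is the sole wrapper admitted inside omp.taskloop, and the composite must still bottom out in a single omp.loop_nest. A reduced sketch of that accepted nesting, still written with the omp.simdloop for syntax the TODOs mark for removal:

  omp.taskloop {
    omp.simdloop for (%iv) : i32 = (%lb) to (%ub) step (%step) {
      omp.loop_nest (%i) : i32 = (%lb) to (%ub) step (%step) {
        omp.yield
      }
      omp.yield
    }
  }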
diff --git a/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp b/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp
index 6730f9b292ad93..b098a5a23fd316 100644
--- a/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp
@@ -109,7 +109,7 @@ static LogicalResult testReifyValueBounds(func::FuncOp funcOp,
       FailureOr<OpFoldResult> reified = failure();
       if (constant) {
         auto reifiedConst = ValueBoundsConstraintSet::computeConstantBound(
-            boundType, value, dim, /*stopCondition=*/nullptr);
+            boundType, {value, dim}, /*stopCondition=*/nullptr);
         if (succeeded(reifiedConst))
           reified = FailureOr<OpFoldResult>(rewriter.getIndexAttr(*reifiedConst));
       } else if (scalable) {
@@ -128,22 +128,12 @@ static LogicalResult testReifyValueBounds(func::FuncOp funcOp,
             rewriter, loc, reifiedScalable->map, vscaleOperand);
       }
     } else {
-      if (dim) {
-        if (useArithOps) {
-          reified = arith::reifyShapedValueDimBound(
-              rewriter, op->getLoc(), boundType, value, *dim, stopCondition);
-        } else {
-          reified = reifyShapedValueDimBound(rewriter, op->getLoc(), boundType,
-                                             value, *dim, stopCondition);
-        }
+      if (useArithOps) {
+        reified = arith::reifyValueBound(rewriter, op->getLoc(), boundType,
+                                         op.getVariable(), stopCondition);
       } else {
-        if (useArithOps) {
-          reified = arith::reifyIndexValueBound(
-              rewriter, op->getLoc(), boundType, value, stopCondition);
-        } else {
-          reified = reifyIndexValueBound(rewriter, op->getLoc(), boundType,
-                                         value, stopCondition);
-        }
+        reified = reifyValueBound(rewriter, op->getLoc(), boundType,
+                                  op.getVariable(), stopCondition);
       }
     }
     if (failed(reified)) {
@@ -188,9 +178,7 @@ static LogicalResult testEquality(func::FuncOp funcOp) {
   }
 
   auto compare = [&](ValueBoundsConstraintSet::ComparisonOperator cmp) {
-    return ValueBoundsConstraintSet::compare(
-        /*lhs=*/op.getLhs(), /*lhsDim=*/std::nullopt, cmp,
-        /*rhs=*/op.getRhs(), /*rhsDim=*/std::nullopt);
+    return ValueBoundsConstraintSet::compare(op.getLhs(), cmp, op.getRhs());
   };
   if (compare(cmpType)) {
     op->emitRemark("true");
diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp
index 25c5190ca0ef3a..a23ed89c4b04d1 100644
--- a/mlir/test/lib/Dialect/Test/TestDialect.cpp
+++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp
@@ -549,6 +549,12 @@ LogicalResult ReifyBoundOp::verify() {
   return success();
 }
 
+::mlir::ValueBoundsConstraintSet::Variable ReifyBoundOp::getVariable() {
+  if (getDim().has_value())
+    return ValueBoundsConstraintSet::Variable(getVar(), *getDim());
+  return ValueBoundsConstraintSet::Variable(getVar());
+}
+
 ::mlir::ValueBoundsConstraintSet::ComparisonOperator
 CompareOp::getComparisonOperator() {
   if (getCmp() == "EQ")
@@ -564,6 +570,37 @@ CompareOp::getComparisonOperator() {
   llvm_unreachable("invalid comparison operator");
 }
 
+::mlir::ValueBoundsConstraintSet::Variable CompareOp::getLhs() {
+  if (!getLhsMap())
+    return ValueBoundsConstraintSet::Variable(getVarOperands()[0]);
+  SmallVector<Value> mapOperands(
+      getVarOperands().slice(0, getLhsMap()->getNumInputs()));
+  return ValueBoundsConstraintSet::Variable(*getLhsMap(), mapOperands);
+}
+
+::mlir::ValueBoundsConstraintSet::Variable CompareOp::getRhs() {
+  int64_t rhsOperandsBegin = getLhsMap() ? getLhsMap()->getNumInputs() : 1;
+  if (!getRhsMap())
+    return ValueBoundsConstraintSet::Variable(
+        getVarOperands()[rhsOperandsBegin]);
+  SmallVector<Value> mapOperands(
+      getVarOperands().slice(rhsOperandsBegin, getRhsMap()->getNumInputs()));
+  return ValueBoundsConstraintSet::Variable(*getRhsMap(), mapOperands);
+}
+
+LogicalResult CompareOp::verify() {
+  if (getCompose() && (getLhsMap() || getRhsMap()))
+    return emitOpError(
+        "'compose' not supported when 'lhs_map' or 'rhs_map' is present");
+  int64_t expectedNumOperands = getLhsMap() ? getLhsMap()->getNumInputs() : 1;
+  expectedNumOperands += getRhsMap() ? getRhsMap()->getNumInputs() : 1;
+  if (getVarOperands().size() != size_t(expectedNumOperands))
+    return emitOpError("expected ")
+           << expectedNumOperands << " operands, but got "
+           << getVarOperands().size();
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // Test removing op with inner ops.
 //===----------------------------------------------------------------------===//
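Given the positional slicing implemented by getLhs/getRhs above, a generic-form instance of test.compare with an affine map on the left-hand side would look roughly as follows (a sketch only: %a, %b and %c are assumed index values, and "LE" stands for whichever comparison spellings getComparisonOperator accepts):

  "test.compare"(%a, %b, %c) {cmp = "LE", lhs_map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index, index) -> ()

Here lhs_map has two inputs, so %a and %b feed the LHS variable and the remaining operand %c is the RHS; the verifier accepts this because 2 + 1 matches the three var_operands provided.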
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index ebf158b8bb8203..b641b3da719c78 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -2207,6 +2207,7 @@ def ReifyBoundOp : TEST_Op<"reify_bound", [Pure]> {
 
   let extraClassDeclaration = [{
     ::mlir::presburger::BoundType getBoundType();
+    ::mlir::ValueBoundsConstraintSet::Variable getVariable();
   }];
 
   let hasVerifier = 1;
@@ -2217,18 +2218,29 @@ def CompareOp : TEST_Op<"compare"> {
     Compare `lhs` and `rhs`. A remark is emitted which indicates whether the
     specified comparison operator was proven to hold. The remark also indicates
     whether the opposite comparison operator was proven to hold.
+
+    `var_operands` must have exactly two operands: one for the LHS and one for
+    the RHS. If `lhs_map` is specified, as many operands as `lhs_map` has
+    inputs are expected instead of the first operand. If `rhs_map` is
+    specified, as many operands as `rhs_map` has inputs are expected instead
+    of the second operand.
   }];
 
-  let arguments = (ins Index:$lhs,
-                       Index:$rhs,
+  let arguments = (ins Variadic<Index>:$var_operands,
                        DefaultValuedAttr:$cmp,
+                       OptionalAttr<AffineMapAttr>:$lhs_map,
+                       OptionalAttr<AffineMapAttr>:$rhs_map,
                        UnitAttr:$compose);
   let results = (outs);
 
   let extraClassDeclaration = [{
     ::mlir::ValueBoundsConstraintSet::ComparisonOperator
     getComparisonOperator();
+    ::mlir::ValueBoundsConstraintSet::Variable getLhs();
+    ::mlir::ValueBoundsConstraintSet::Variable getRhs();
  }];
+
+  let hasVerifier = 1;
 }
 
 //===----------------------------------------------------------------------===//
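ReifyBoundOp::getVariable follows the same pattern on the reification side: with a dim attribute it builds a shaped-value dimension Variable, and without one it wraps the index-typed operand directly. A hypothetical use in IR (the type and dim attribute names here are assumptions for illustration, not taken from this patch):

  %ub0 = "test.reify_bound"(%m) {type = "UB", dim = 0 : i64} : (memref<?xf32>) -> (index)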