diff --git a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp index eae0d84721c43..cfc8ff653f169 100644 --- a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp +++ b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp @@ -268,8 +268,7 @@ int main(int argc, const char **argv) { Error = false; llvm::sys::Mutex IndexMutex; // ExecutorConcurrency is a flag exposed by AllTUsExecution.h - llvm::ThreadPool Pool(ExecutorConcurrency == 0 ? llvm::hardware_concurrency() - : ExecutorConcurrency); + llvm::ThreadPool Pool(llvm::hardware_concurrency(ExecutorConcurrency)); for (auto &Group : USRToBitcode) { Pool.async([&]() { std::vector> Infos; diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp index 52c1ceef74259..3fbf98970cceb 100644 --- a/clang-tools-extra/clangd/CodeComplete.cpp +++ b/clang-tools-extra/clangd/CodeComplete.cpp @@ -87,9 +87,8 @@ CompletionItemKind toCompletionItemKind(index::SymbolKind Kind) { return CompletionItemKind::Text; case SK::Enum: return CompletionItemKind::Enum; - // FIXME(ioeric): use LSP struct instead of class when it is suppoted in the - // protocol. case SK::Struct: + return CompletionItemKind::Struct; case SK::Class: case SK::Protocol: case SK::Extension: @@ -102,18 +101,16 @@ CompletionItemKind toCompletionItemKind(index::SymbolKind Kind) { case SK::Using: return CompletionItemKind::Reference; case SK::Function: - // FIXME(ioeric): this should probably be an operator. This should be fixed - // when `Operator` is support type in the protocol. case SK::ConversionFunction: return CompletionItemKind::Function; case SK::Variable: case SK::Parameter: + case SK::NonTypeTemplateParm: return CompletionItemKind::Variable; case SK::Field: return CompletionItemKind::Field; - // FIXME(ioeric): use LSP enum constant when it is supported in the protocol. 
case SK::EnumConstant: - return CompletionItemKind::Value; + return CompletionItemKind::EnumMember; case SK::InstanceMethod: case SK::ClassMethod: case SK::StaticMethod: @@ -125,6 +122,9 @@ CompletionItemKind toCompletionItemKind(index::SymbolKind Kind) { return CompletionItemKind::Property; case SK::Constructor: return CompletionItemKind::Constructor; + case SK::TemplateTypeParm: + case SK::TemplateTemplateParm: + return CompletionItemKind::TypeParameter; } llvm_unreachable("Unhandled clang::index::SymbolKind."); } diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index ae4c441a73b57..750df50c47777 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -115,15 +115,6 @@ std::string printDefinition(const Decl *D) { return Definition; } -void printParams(llvm::raw_ostream &OS, - const std::vector &Params) { - for (size_t I = 0, E = Params.size(); I != E; ++I) { - if (I) - OS << ", "; - OS << Params.at(I); - } -} - std::string printType(QualType QT, const PrintingPolicy &Policy) { // TypePrinter doesn't resolve decltypes, so resolve them here. // FIXME: This doesn't handle composite types that contain a decltype in them. @@ -133,6 +124,43 @@ std::string printType(QualType QT, const PrintingPolicy &Policy) { return QT.getAsString(Policy); } +std::string printType(const TemplateTypeParmDecl *TTP) { + std::string Res = TTP->wasDeclaredWithTypename() ? 
"typename" : "class"; + if (TTP->isParameterPack()) + Res += "..."; + return Res; +} + +std::string printType(const NonTypeTemplateParmDecl *NTTP, + const PrintingPolicy &PP) { + std::string Res = printType(NTTP->getType(), PP); + if (NTTP->isParameterPack()) + Res += "..."; + return Res; +} + +std::string printType(const TemplateTemplateParmDecl *TTP, + const PrintingPolicy &PP) { + std::string Res; + llvm::raw_string_ostream OS(Res); + OS << "template <"; + llvm::StringRef Sep = ""; + for (const Decl *Param : *TTP->getTemplateParameters()) { + OS << Sep; + Sep = ", "; + if (const auto *TTP = dyn_cast(Param)) + OS << printType(TTP); + else if (const auto *NTTP = dyn_cast(Param)) + OS << printType(NTTP, PP); + else if (const auto *TTPD = dyn_cast(Param)) + OS << printType(TTPD, PP); + } + // FIXME: TemplateTemplateParameter doesn't store the info on whether this + // param was a "typename" or "class". + OS << "> class"; + return OS.str(); +} + std::vector fetchTemplateParameters(const TemplateParameterList *Params, const PrintingPolicy &PP) { @@ -142,38 +170,30 @@ fetchTemplateParameters(const TemplateParameterList *Params, for (const Decl *Param : *Params) { HoverInfo::Param P; if (const auto *TTP = dyn_cast(Param)) { - P.Type = TTP->wasDeclaredWithTypename() ? 
"typename" : "class"; - if (TTP->isParameterPack()) - *P.Type += "..."; + P.Type = printType(TTP); if (!TTP->getName().empty()) P.Name = TTP->getNameAsString(); + if (TTP->hasDefaultArgument()) P.Default = TTP->getDefaultArgument().getAsString(PP); } else if (const auto *NTTP = dyn_cast(Param)) { + P.Type = printType(NTTP, PP); + if (IdentifierInfo *II = NTTP->getIdentifier()) P.Name = II->getName().str(); - P.Type = printType(NTTP->getType(), PP); - if (NTTP->isParameterPack()) - *P.Type += "..."; - if (NTTP->hasDefaultArgument()) { P.Default.emplace(); llvm::raw_string_ostream Out(*P.Default); NTTP->getDefaultArgument()->printPretty(Out, nullptr, PP); } } else if (const auto *TTPD = dyn_cast(Param)) { - P.Type.emplace(); - llvm::raw_string_ostream OS(*P.Type); - OS << "template <"; - printParams(OS, - fetchTemplateParameters(TTPD->getTemplateParameters(), PP)); - OS << "> class"; // FIXME: TemplateTemplateParameter doesn't store the - // info on whether this param was a "typename" or - // "class". + P.Type = printType(TTPD, PP); + if (!TTPD->getName().empty()) P.Name = TTPD->getNameAsString(); + if (TTPD->hasDefaultArgument()) { P.Default.emplace(); llvm::raw_string_ostream Out(*P.Default); @@ -385,6 +405,10 @@ HoverInfo getHoverContents(const NamedDecl *D, const SymbolIndex *Index) { fillFunctionTypeAndParams(HI, D, FD, Policy); else if (const auto *VD = dyn_cast(D)) HI.Type = printType(VD->getType(), Policy); + else if (const auto *TTP = dyn_cast(D)) + HI.Type = TTP->wasDeclaredWithTypename() ? "typename" : "class"; + else if (const auto *TTP = dyn_cast(D)) + HI.Type = printType(TTP, Policy); // Fill in value with evaluated initializer if possible. 
if (const auto *Var = dyn_cast(D)) { diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp index 1e71c2ab37f5e..8e89c1f45f3a5 100644 --- a/clang-tools-extra/clangd/Protocol.cpp +++ b/clang-tools-extra/clangd/Protocol.cpp @@ -14,6 +14,7 @@ #include "Logger.h" #include "URI.h" #include "clang/Basic/LLVM.h" +#include "clang/Index/IndexSymbol.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringSwitch.h" @@ -261,9 +262,13 @@ SymbolKind indexSymbolKindToSymbolKind(index::SymbolKind Kind) { case index::SymbolKind::ConversionFunction: return SymbolKind::Function; case index::SymbolKind::Parameter: + case index::SymbolKind::NonTypeTemplateParm: return SymbolKind::Variable; case index::SymbolKind::Using: return SymbolKind::Namespace; + case index::SymbolKind::TemplateTemplateParm: + case index::SymbolKind::TemplateTypeParm: + return SymbolKind::TypeParameter; } llvm_unreachable("invalid symbol kind"); } diff --git a/clang-tools-extra/clangd/Quality.cpp b/clang-tools-extra/clangd/Quality.cpp index d80790fc98083..2261ff61e4990 100644 --- a/clang-tools-extra/clangd/Quality.cpp +++ b/clang-tools-extra/clangd/Quality.cpp @@ -129,6 +129,8 @@ categorize(const index::SymbolInfo &D) { case index::SymbolKind::Extension: case index::SymbolKind::Union: case index::SymbolKind::TypeAlias: + case index::SymbolKind::TemplateTypeParm: + case index::SymbolKind::TemplateTemplateParm: return SymbolQualitySignals::Type; case index::SymbolKind::Function: case index::SymbolKind::ClassMethod: @@ -147,6 +149,7 @@ categorize(const index::SymbolInfo &D) { case index::SymbolKind::Field: case index::SymbolKind::EnumConstant: case index::SymbolKind::Parameter: + case index::SymbolKind::NonTypeTemplateParm: return SymbolQualitySignals::Variable; case index::SymbolKind::Using: case index::SymbolKind::Module: diff --git a/clang-tools-extra/clangd/TUScheduler.cpp b/clang-tools-extra/clangd/TUScheduler.cpp index 
5a1caa9645209..f59c19e8031ee 100644 --- a/clang-tools-extra/clangd/TUScheduler.cpp +++ b/clang-tools-extra/clangd/TUScheduler.cpp @@ -842,13 +842,7 @@ std::string renderTUAction(const TUAction &Action) { } // namespace unsigned getDefaultAsyncThreadsCount() { - unsigned HardwareConcurrency = llvm::heavyweight_hardware_concurrency(); - // heavyweight_hardware_concurrency may fall back to hardware_concurrency. - // C++ standard says that hardware_concurrency() may return 0; fallback to 1 - // worker thread in that case. - if (HardwareConcurrency == 0) - return 1; - return HardwareConcurrency; + return llvm::heavyweight_hardware_concurrency().compute_thread_count(); } FileStatus TUStatus::render(PathRef File) const { diff --git a/clang-tools-extra/clangd/index/Background.cpp b/clang-tools-extra/clangd/index/Background.cpp index ab80113a4a955..c2541237c3c93 100644 --- a/clang-tools-extra/clangd/index/Background.cpp +++ b/clang-tools-extra/clangd/index/Background.cpp @@ -148,9 +148,10 @@ BackgroundIndex::BackgroundIndex( CDB.watch([&](const std::vector &ChangedFiles) { enqueue(ChangedFiles); })) { - assert(ThreadPoolSize > 0 && "Thread pool size can't be zero."); + assert(Rebuilder.TUsBeforeFirstBuild > 0 && + "Thread pool size can't be zero."); assert(this->IndexStorageFactory && "Storage factory can not be null!"); - for (unsigned I = 0; I < ThreadPoolSize; ++I) { + for (unsigned I = 0; I < Rebuilder.TUsBeforeFirstBuild; ++I) { ThreadPool.runAsync("background-worker-" + llvm::Twine(I + 1), [this] { WithContext Ctx(this->BackgroundContext.clone()); Queue.work([&] { Rebuilder.idle(); }); diff --git a/clang-tools-extra/clangd/index/Background.h b/clang-tools-extra/clangd/index/Background.h index b11008de15d02..2ae11c72d5d43 100644 --- a/clang-tools-extra/clangd/index/Background.h +++ b/clang-tools-extra/clangd/index/Background.h @@ -135,7 +135,7 @@ class BackgroundIndex : public SwapIndex { Context BackgroundContext, const FileSystemProvider &, const 
GlobalCompilationDatabase &CDB, BackgroundIndexStorage::Factory IndexStorageFactory, - size_t ThreadPoolSize = llvm::heavyweight_hardware_concurrency(), + size_t ThreadPoolSize = 0, // 0 = use all hardware threads std::function OnProgress = nullptr); ~BackgroundIndex(); // Blocks while the current task finishes. diff --git a/clang-tools-extra/clangd/index/BackgroundRebuild.h b/clang-tools-extra/clangd/index/BackgroundRebuild.h index d74c28be5cfb1..295f705c98e8f 100644 --- a/clang-tools-extra/clangd/index/BackgroundRebuild.h +++ b/clang-tools-extra/clangd/index/BackgroundRebuild.h @@ -49,7 +49,9 @@ class BackgroundIndexRebuilder { public: BackgroundIndexRebuilder(SwapIndex *Target, FileSymbols *Source, unsigned Threads) - : TUsBeforeFirstBuild(Threads), Target(Target), Source(Source) {} + : TUsBeforeFirstBuild(llvm::heavyweight_hardware_concurrency(Threads) + .compute_thread_count()), + Target(Target), Source(Source) {} // Called to indicate a TU has been indexed. // May rebuild, if enough TUs have been indexed. diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp index 8b9b1d0033a5a..c5dd09b995087 100644 --- a/clang-tools-extra/clangd/refactor/Rename.cpp +++ b/clang-tools-extra/clangd/refactor/Rename.cpp @@ -13,6 +13,7 @@ #include "ParsedAST.h" #include "Selection.h" #include "SourceCode.h" +#include "Trace.h" #include "index/SymbolCollector.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclTemplate.h" @@ -124,6 +125,7 @@ llvm::Optional renameable(const NamedDecl &RenameDecl, StringRef MainFilePath, const SymbolIndex *Index, bool CrossFile) { + trace::Span Tracer("Renameable"); // Filter out symbols that are unsupported in both rename modes. if (llvm::isa(&RenameDecl)) return ReasonToReject::UnsupportedSymbol; @@ -225,6 +227,7 @@ llvm::Error makeError(ReasonToReject Reason) { // Return all rename occurrences in the main file. 
std::vector findOccurrencesWithinFile(ParsedAST &AST, const NamedDecl &ND) { + trace::Span Tracer("FindOccurrencesWithinFile"); // If the cursor is at the underlying CXXRecordDecl of the // ClassTemplateDecl, ND will be the CXXRecordDecl. In this case, we need to // get the primary template maunally. @@ -260,6 +263,7 @@ std::vector findOccurrencesWithinFile(ParsedAST &AST, llvm::Expected renameWithinFile(ParsedAST &AST, const NamedDecl &RenameDecl, llvm::StringRef NewName) { + trace::Span Tracer("RenameWithinFile"); const SourceManager &SM = AST.getSourceManager(); tooling::Replacements FilteredChanges; @@ -319,6 +323,7 @@ std::vector getConstructors(const NamedDecl *ND) { llvm::Expected>> findOccurrencesOutsideFile(const NamedDecl &RenameDecl, llvm::StringRef MainFile, const SymbolIndex &Index) { + trace::Span Tracer("FindOccurrencesOutsideFile"); RefsRequest RQuest; RQuest.IDs.insert(*getSymbolID(&RenameDecl)); // Classes and their constructors are different symbols, and have different @@ -361,6 +366,9 @@ findOccurrencesOutsideFile(const NamedDecl &RenameDecl, auto &Ranges = FileAndOccurrences.getValue(); llvm::sort(Ranges); Ranges.erase(std::unique(Ranges.begin(), Ranges.end()), Ranges.end()); + + SPAN_ATTACH(Tracer, FileAndOccurrences.first(), + static_cast(Ranges.size())); } return AffectedFiles; } @@ -381,6 +389,7 @@ llvm::Expected renameOutsideFile( const NamedDecl &RenameDecl, llvm::StringRef MainFilePath, llvm::StringRef NewName, const SymbolIndex &Index, llvm::function_ref(PathRef)> GetFileContent) { + trace::Span Tracer("RenameOutsideFile"); auto AffectedFiles = findOccurrencesOutsideFile(RenameDecl, MainFilePath, Index); if (!AffectedFiles) @@ -463,6 +472,7 @@ void findNearMiss( } // namespace llvm::Expected rename(const RenameInputs &RInputs) { + trace::Span Tracer("Rename flow"); ParsedAST &AST = RInputs.AST; const SourceManager &SM = AST.getSourceManager(); llvm::StringRef MainFileCode = SM.getBufferData(SM.getMainFileID()); @@ -555,6 +565,11 @@
llvm::Expected buildRenameEdit(llvm::StringRef AbsFilePath, llvm::StringRef InitialCode, std::vector Occurrences, llvm::StringRef NewName) { + trace::Span Tracer("BuildRenameEdit"); + SPAN_ATTACH(Tracer, "file_path", AbsFilePath); + SPAN_ATTACH(Tracer, "rename_occurrences", + static_cast(Occurrences.size())); + assert(std::is_sorted(Occurrences.begin(), Occurrences.end())); assert(std::unique(Occurrences.begin(), Occurrences.end()) == Occurrences.end() && @@ -618,6 +633,7 @@ llvm::Expected buildRenameEdit(llvm::StringRef AbsFilePath, llvm::Optional> adjustRenameRanges(llvm::StringRef DraftCode, llvm::StringRef Identifier, std::vector Indexed, const LangOptions &LangOpts) { + trace::Span Tracer("AdjustRenameRanges"); assert(!Indexed.empty()); assert(std::is_sorted(Indexed.begin(), Indexed.end())); std::vector Lexed = @@ -628,12 +644,16 @@ adjustRenameRanges(llvm::StringRef DraftCode, llvm::StringRef Identifier, llvm::Optional> getMappedRanges(ArrayRef Indexed, ArrayRef Lexed) { + trace::Span Tracer("GetMappedRanges"); assert(!Indexed.empty()); assert(std::is_sorted(Indexed.begin(), Indexed.end())); assert(std::is_sorted(Lexed.begin(), Lexed.end())); if (Indexed.size() > Lexed.size()) { vlog("The number of lexed occurrences is less than indexed occurrences"); + SPAN_ATTACH( + Tracer, "error", + "The number of lexed occurrences is less than indexed occurrences"); return llvm::None; } // Fast check for the special subset case. 
@@ -660,15 +680,18 @@ llvm::Optional> getMappedRanges(ArrayRef Indexed, }); if (HasMultiple) { vlog("The best near miss is not unique."); + SPAN_ATTACH(Tracer, "error", "The best near miss is not unique"); return llvm::None; } if (Best.empty()) { vlog("Didn't find a near miss."); + SPAN_ATTACH(Tracer, "error", "Didn't find a near miss"); return llvm::None; } std::vector Mapped; for (auto I : Best) Mapped.push_back(Lexed[I]); + SPAN_ATTACH(Tracer, "mapped_ranges", static_cast(Mapped.size())); return Mapped; } diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp index a39c7431044f4..f9ffe11673380 100644 --- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp +++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp @@ -481,7 +481,7 @@ TEST(CompletionTest, Kinds) { AllOf(Has("function", CompletionItemKind::Function), Has("variable", CompletionItemKind::Variable), Has("int", CompletionItemKind::Keyword), - Has("Struct", CompletionItemKind::Class), + Has("Struct", CompletionItemKind::Struct), Has("MACRO", CompletionItemKind::Text), Has("indexFunction", CompletionItemKind::Function), Has("indexVariable", CompletionItemKind::Variable), @@ -529,6 +529,17 @@ TEST(CompletionTest, Kinds) { AllOf(Named("complete_variable"), Kind(CompletionItemKind::Variable)), AllOf(Named("complete_static_member"), Kind(CompletionItemKind::Property)))); + + Results = completions( + R"cpp( + enum Color { + Red + }; + Color u = ^ + )cpp"); + EXPECT_THAT(Results.Completions, + Contains( + AllOf(Named("Red"), Kind(CompletionItemKind::EnumMember)))); } TEST(CompletionTest, NoDuplicates) { diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp index 2876e2f31c135..503b4d2afa42a 100644 --- a/clang-tools-extra/clangd/unittests/HoverTests.cpp +++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -573,6 +573,42 @@ class Foo {})cpp"; // pattern. 
HI.Documentation = "comment from primary"; }}, + {// Template Type Parameter + R"cpp( + template void foo(); + )cpp", + [](HoverInfo &HI) { + HI.Name = "T"; + HI.Kind = index::SymbolKind::TemplateTypeParm; + HI.NamespaceScope = ""; + HI.Definition = "typename T = int"; + HI.LocalScope = "foo::"; + HI.Type = "typename"; + }}, + {// TemplateTemplate Type Parameter + R"cpp( + template class [[^T]]> void foo(); + )cpp", + [](HoverInfo &HI) { + HI.Name = "T"; + HI.Kind = index::SymbolKind::TemplateTemplateParm; + HI.NamespaceScope = ""; + HI.Definition = "template class T"; + HI.LocalScope = "foo::"; + HI.Type = "template class"; + }}, + {// NonType Template Parameter + R"cpp( + template void foo(); + )cpp", + [](HoverInfo &HI) { + HI.Name = "T"; + HI.Kind = index::SymbolKind::NonTypeTemplateParm; + HI.NamespaceScope = ""; + HI.Definition = "int T = 5"; + HI.LocalScope = "foo::"; + HI.Type = "int"; + }}, }; for (const auto &Case : Cases) { SCOPED_TRACE(Case.Code); diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 6c8c9f8020823..856d5e34bbcc2 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -1190,50 +1190,8 @@ installed. Controlling Floating Point Behavior ----------------------------------- -Clang provides a number of ways to control floating point behavior, including -with command line options and source pragmas. This section -describes the various floating point semantic modes and the corresponding options. - -.. 
csv-table:: Floating Point Semantic Modes - :header: "Mode", "Values" - :widths: 15, 30, 30 - - "except_behavior", "{ignore, strict, may_trap}", "ffp-exception-behavior" - "fenv_access", "{off, on}", "(none)" - "rounding_mode", "{dynamic, tonearest, downward, upward, towardzero}", "frounding-math" - "contract", "{on, off, fast}", "ffp-contract" - "denormal_fp_math", "{IEEE, PreserveSign, PositiveZero}", "fdenormal-fp-math" - "denormal_fp32_math", "{IEEE, PreserveSign, PositiveZero}", "fdenormal-fp-math-fp32" - "support_math_errno", "{on, off}", "fmath-errno" - "no_honor_nans", "{on, off}", "fhonor-nans" - "no_honor_infinities", "{on, off}", "fhonor-infinities" - "no_signed_zeros", "{on, off}", "fsigned-zeros" - "allow_reciprocal", "{on, off}", "freciprocal-math" - "allow_approximate_fns", "{on, off}", "(none)" - "allow_reassociation", "{on, off}", "fassociative-math" - - -This table describes the option settings that correspond to the three -floating point semantic models: precise (the default), strict, and fast. - - -.. csv-table:: Floating Point Models - :header: "Mode", "Precise", "Strict", "Fast" - :widths: 25, 15, 15, 15 - - "except_behavior", "ignore", "strict", "ignore" - "fenv_access", "off", "on", "off" - "rounding_mode", "tonearest", "dynamic", "tonearest" - "contract", "on", "off", "fast" - "denormal_fp_math", "IEEE", "IEEE", "PreserveSign" - "denormal_fp32_math", "IEEE","IEEE", "PreserveSign" - "support_math_errno", "on", "on", "off" - "no_honor_nans", "off", "off", "on" - "no_honor_infinities", "off", "off", "on" - "no_signed_zeros", "off", "off", "on" - "allow_reciprocal", "off", "off", "on" - "allow_approximate_fns", "off", "off", "on" - "allow_reassociation", "off", "off", "on" +Clang provides a number of ways to control floating point behavior. The options +are listed below. .. option:: -ffast-math @@ -1427,7 +1385,7 @@ Note that floating-point operations performed as part of constant initialization and ``fast``. 
Details: - * ``precise`` Disables optimizations that are not value-safe on floating-point data, although FP contraction (FMA) is enabled (``-ffp-contract=on``). This is the default behavior. + * ``precise`` Disables optimizations that are not value-safe on floating-point data, although FP contraction (FMA) is enabled (``-ffp-contract=fast``). This is the default behavior. * ``strict`` Enables ``-frounding-math`` and ``-ffp-exception-behavior=strict``, and disables contractions (FMA). All of the ``-ffast-math`` enablements are disabled. * ``fast`` Behaves identically to specifying both ``-ffast-math`` and ``ffp-contract=fast`` diff --git a/clang/include/clang-c/BuildSystem.h b/clang/include/clang-c/BuildSystem.h index 4e9f6dee02795..296e61247cef5 100644 --- a/clang/include/clang-c/BuildSystem.h +++ b/clang/include/clang-c/BuildSystem.h @@ -117,7 +117,7 @@ clang_ModuleMapDescriptor_setFrameworkModuleName(CXModuleMapDescriptor, const char *name); /** - * Sets the umbrealla header name that the module.map describes. + * Sets the umbrella header name that the module.map describes. * \returns 0 for success, non-zero to indicate an error. */ CINDEX_LINKAGE enum CXErrorCode diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index b653995ebbd01..efb96f3cc5b6b 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -3745,7 +3745,7 @@ CINDEX_LINKAGE unsigned clang_Type_getNumObjCProtocolRefs(CXType T); CINDEX_LINKAGE CXCursor clang_Type_getObjCProtocolDecl(CXType T, unsigned i); /** - * Retreive the number of type arguments associated with an ObjC object. + * Retrieve the number of type arguments associated with an ObjC object. * * If the type is not an ObjC object, 0 is returned. 
*/ diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h index bbaa46363d971..b2b53e80dc95f 100644 --- a/clang/include/clang/AST/Attr.h +++ b/clang/include/clang/AST/Attr.h @@ -17,6 +17,7 @@ #include "clang/AST/AttrIterator.h" #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" +#include "clang/AST/OpenMPClause.h" #include "clang/AST/Type.h" #include "clang/Basic/AttrKinds.h" #include "clang/Basic/AttributeCommonInfo.h" diff --git a/clang/include/clang/AST/DeclObjC.h b/clang/include/clang/AST/DeclObjC.h index 73dc4ddab8983..954b9bc15789b 100644 --- a/clang/include/clang/AST/DeclObjC.h +++ b/clang/include/clang/AST/DeclObjC.h @@ -402,7 +402,7 @@ class ObjCMethodDecl : public NamedDecl, public DeclContext { } /// createImplicitParams - Used to lazily create the self and cmd - /// implict parameters. This must be called prior to using getSelfDecl() + /// implicit parameters. This must be called prior to using getSelfDecl() /// or getCmdDecl(). The call is ignored if the implicit parameters /// have already been created. void createImplicitParams(ASTContext &Context, const ObjCInterfaceDecl *ID); diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index f103530457ee3..ec470100f4ca2 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -31,6 +31,7 @@ #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPContext.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/TrailingObjects.h" @@ -6658,6 +6659,53 @@ class OMPClausePrinter final : public OMPClauseVisitor { #include "clang/Basic/OpenMPKinds.def" }; +/// Helper data structure representing the traits in a match clause of an +/// `declare variant` or `metadirective`. 
The outer level is an ordered +/// collection of selector sets, each with an associated kind and an ordered +/// collection of selectors. A selector has a kind, an optional score/condition, +/// and an ordered collection of properties. +struct OMPTraitInfo { + struct OMPTraitProperty { + llvm::omp::TraitProperty Kind = llvm::omp::TraitProperty::invalid; + }; + struct OMPTraitSelector { + Expr *ScoreOrCondition = nullptr; + llvm::omp::TraitSelector Kind = llvm::omp::TraitSelector::invalid; + llvm::SmallVector Properties; + }; + struct OMPTraitSet { + llvm::omp::TraitSet Kind = llvm::omp::TraitSet::invalid; + llvm::SmallVector Selectors; + }; + + /// The outermost level of selector sets. + llvm::SmallVector Sets; + + bool anyScoreOrCondition( + llvm::function_ref Cond) { + return llvm::any_of(Sets, [Cond](OMPTraitInfo::OMPTraitSet &Set) { + return llvm::any_of( + Set.Selectors, [Cond](OMPTraitInfo::OMPTraitSelector &Selector) { + return Cond(Selector.ScoreOrCondition, + /* IsScore */ Selector.Kind != + llvm::omp::TraitSelector::user_condition); + }); + }); + } + + /// Create a variant match info object from this trait info object. While the + /// former is a flat representation the actual main difference is that the + /// latter uses clang::Expr to store the score/condition while the former is + /// independent of clang. Thus, expressions and conditions are evaluated in + /// this method. + void getAsVariantMatchInfo(ASTContext &ASTCtx, + llvm::omp::VariantMatchInfo &VMI) const; + + /// Print a human readable representation into \p OS. 
+ void print(llvm::raw_ostream &OS, const PrintingPolicy &Policy) const; +}; +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const OMPTraitInfo &TI); + } // namespace clang #endif // LLVM_CLANG_AST_OPENMPCLAUSE_H diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 6eeaf05ec71ba..c454c4a80500b 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -180,6 +180,27 @@ class FunctionArgument : Argument : Argument; + +// An argument of a OMPDeclareVariantAttribute that represents the `match` +// clause of the declare variant by keeping the information (incl. nesting) in +// an OMPTraitInfo object. +// +// With some exceptions, the `match()` clause looks roughly +// as follows: +// context-selector := list +// selector-set := ={list} +// selector := ([score():] list) +// trait := +// +// The structure of an OMPTraitInfo object is a tree as defined below: +// +// OMPTraitInfo := {list} +// OMPTraitSet := {Kind, list} +// OMPTraitSelector := {Kind, Expr, list} +// OMPTraitProperty := {Kind} +// +class OMPTraitInfoArgument : Argument; + class TypeArgument : Argument; class UnsignedArgument : Argument; class VariadicUnsignedArgument : Argument; @@ -3705,20 +3726,10 @@ def OMPDeclareVariant : InheritableAttr { let Documentation = [OMPDeclareVariantDocs]; let Args = [ ExprArgument<"VariantFuncRef">, - VariadicExprArgument<"Scores">, - VariadicUnsignedArgument<"CtxSelectorSets">, - VariadicUnsignedArgument<"CtxSelectors">, - VariadicStringArgument<"ImplVendors">, - VariadicStringArgument<"DeviceKinds"> + OMPTraitInfoArgument<"TraitInfos">, ]; let AdditionalMembers = [{ - void printScore(raw_ostream & OS, const PrintingPolicy &Policy, unsigned I) const { - if (const Expr *E = *std::next(scores_begin(), I)) { - OS << "score("; - E->printPretty(OS, nullptr, Policy); - OS << "):"; - } - } + ~OMPDeclareVariantAttr() { delete traitInfos; } void printPrettyPragma(raw_ostream & OS, const PrintingPolicy 
&Policy) const { if (const Expr *E = getVariantFuncRef()) { @@ -3726,66 +3737,8 @@ def OMPDeclareVariant : InheritableAttr { E->printPretty(OS, nullptr, Policy); OS << ")"; } - // TODO: add printing of real context selectors. OS << " match("; - int Used[OMP_CTX_SET_unknown] = {0}; - for (unsigned I = 0, E = ctxSelectorSets_size(); I < E; ++I) { - auto CtxSet = static_cast( - *std::next(ctxSelectorSets_begin(), I)); - if (Used[CtxSet]) - continue; - if (I > 0) - OS << ","; - switch (CtxSet) { - case OMP_CTX_SET_implementation: - OS << "implementation={"; - break; - case OMP_CTX_SET_device: - OS << "device={"; - break; - case OMP_CTX_SET_unknown: - llvm_unreachable("Unknown context selector set."); - } - Used[CtxSet] = 1; - for (unsigned K = I, EK = ctxSelectors_size(); K < EK; ++K) { - auto CtxSetK = static_cast( - *std::next(ctxSelectorSets_begin(), K)); - if (CtxSet != CtxSetK) - continue; - if (K != I) - OS << ","; - auto Ctx = static_cast( - *std::next(ctxSelectors_begin(), K)); - switch (Ctx) { - case OMP_CTX_vendor: - assert(CtxSet == OMP_CTX_SET_implementation && - "Expected implementation context selector set."); - OS << "vendor("; - printScore(OS, Policy, K); - if (implVendors_size() > 0) { - OS << *implVendors(). 
begin(); - for (StringRef VendorName : llvm::drop_begin(implVendors(), 1)) - OS << ", " << VendorName; - } - OS << ")"; - break; - case OMP_CTX_kind: - assert(CtxSet == OMP_CTX_SET_device && - "Expected device context selector set."); - OS << "kind("; - if (deviceKinds_size() > 0) { - OS << *deviceKinds().begin(); - for (StringRef KindName : llvm::drop_begin(deviceKinds(), 1)) - OS << ", " << KindName; - } - OS << ")"; - break; - case OMP_CTX_unknown: - llvm_unreachable("Unknown context selector."); - } - } - OS << "}"; - } + traitInfos->print(OS, Policy); OS << ")"; } }]; diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 8e62c0e8325d5..9410afb7aa027 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1258,30 +1258,68 @@ def err_omp_mapper_expected_declarator : Error< "expected declarator on 'omp declare mapper' directive">; def err_omp_declare_variant_wrong_clause : Error< "expected '%0' clause on 'omp declare variant' directive">; -def err_omp_declare_variant_no_ctx_selector : Error< - "expected context selector in '%0' clause on 'omp declare variant' directive">; -def err_omp_declare_variant_equal_expected : Error< - "expected '=' after '%0' context selector set name on 'omp declare variant' directive">; -def warn_omp_declare_variant_cs_name_expected : Warning< - "unknown context selector in '%0' context selector set of 'omp declare variant' directive, ignored">, - InGroup; -def err_omp_declare_variant_item_expected : Error< - "expected %0 in '%1' context selector of '%2' selector set of 'omp declare variant' directive">; -def err_omp_declare_variant_ctx_set_mutiple_use : Error< - "context selector set '%0' is used already in the same 'omp declare variant' directive">; -def note_omp_declare_variant_ctx_set_used_here : Note< - "previously context selector set '%0' used here">; -def err_omp_expected_comma_brace : 
Error<"expected '}' or ',' after '%0'">; -def err_omp_declare_variant_ctx_mutiple_use : Error< - "context trait selector '%0' is used already in the same '%1' context selector set of 'omp declare variant' directive">; -def note_omp_declare_variant_ctx_used_here : Note< - "previously context trait selector '%0' used here">; -def warn_omp_more_one_device_type_clause : Warning< - "more than one 'device_type' clause is specified">, - InGroup; -def err_omp_wrong_device_kind_trait : Error< - "unknown '%0' device kind trait in the 'device' context selector set, expected" - " one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'">; +def warn_omp_declare_variant_string_literal_or_identifier + : Warning<"expected identifier or string literal describing a context " + "%select{set|selector|property}0; " + "%select{set|selector|property}0 skipped">, + InGroup; +def note_omp_declare_variant_ctx_options + : Note<"context %select{set|selector|property}0 options are: %1">; +def warn_omp_declare_variant_expected + : Warning<"expected '%0' after the %1; '%0' assumed">, + InGroup; +def warn_omp_declare_variant_ctx_not_a_property + : Warning<"'%0' is not a valid context property for the context selector " + "'%1' and the context set '%2'; property ignored">, + InGroup; +def note_omp_declare_variant_ctx_is_a + : Note<"'%0' is a context %select{set|selector|property}1 not a context " + "%select{set|selector|property}2">; +def note_omp_declare_variant_ctx_try : Note<"try 'match(%0={%1%2})'">; +def warn_omp_declare_variant_ctx_not_a_selector + : Warning<"'%0' is not a valid context selector for the context set '%1'; " + "selector ignored">, + InGroup; +def warn_omp_declare_variant_ctx_not_a_set + : Warning<"'%0' is not a valid context set in a `declare variant`; set " + "ignored">, + InGroup; +def warn_omp_declare_variant_ctx_mutiple_use + : Warning<"the context %select{set|selector|property}0 '%1' was used " + "already in the same 'omp declare variant' directive; " + 
"%select{set|selector|property}0 ignored">, + InGroup; +def note_omp_declare_variant_ctx_used_here + : Note<"the previous context %select{set|selector|property}0 '%1' used " + "here">; +def note_omp_declare_variant_ctx_continue_here + : Note<"the ignored %select{set|selector|property}0 spans until here">; +def warn_omp_ctx_incompatible_selector_for_set + : Warning<"the context selector '%0' is not valid for the context set " + "'%1'; selector ignored">, + InGroup; +def note_omp_ctx_compatible_set_for_selector + : Note<"the context selector '%0' can be nested in the context set '%1'; " + "try 'match(%1={%0%select{|(property)}2})'">; +def warn_omp_ctx_selector_without_properties + : Warning<"the context selector '%0' in context set '%1' requires a " + "context property defined in parentheses; selector ignored">, + InGroup; +def warn_omp_ctx_incompatible_property_for_selector + : Warning<"the context property '%0' is not valid for the context selector " + "'%1' and the context set '%2'; property ignored">, + InGroup; +def note_omp_ctx_compatible_set_and_selector_for_property + : Note<"the context property '%0' can be nested in the context selector " + "'%1' which is nested in the context set '%2'; try " + "'match(%2={%1(%0)})'">; +def warn_omp_ctx_incompatible_score_for_property + : Warning<"the context selector '%0' in the context set '%1' cannot have a " + "score ('%2'); score ignored">, + InGroup; +def warn_omp_more_one_device_type_clause + : Warning<"more than one 'device_type' clause is specified">, + InGroup; // Pragma loop support. 
def err_pragma_loop_missing_argument : Error< diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 49c590633767a..3d446ec740fe8 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -9951,6 +9951,12 @@ def warn_omp_declare_target_after_first_use : Warning< InGroup; def err_omp_declare_variant_incompat_attributes : Error< "'#pragma omp declare variant' is not compatible with any target-specific attributes">; +def warn_omp_declare_variant_score_not_constant + : Warning<"score expressions in the OpenMP context selector need to be " + "constant; %0 is not and will be ignored">; +def err_omp_declare_variant_user_condition_not_constant + : Error<"the user condition in the OpenMP context selector needs to be " + "constant; %0 is not">; def warn_omp_declare_variant_after_used : Warning< "'#pragma omp declare variant' cannot be applied for function after first " "usage; the original function might be used">, InGroup; diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index dd840b270e636..3ab69a1bb3f1f 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -203,12 +203,6 @@ #ifndef OPENMP_DECLARE_VARIANT_CLAUSE #define OPENMP_DECLARE_VARIANT_CLAUSE(Name) #endif -#ifndef OPENMP_CONTEXT_SELECTOR_SET -#define OPENMP_CONTEXT_SELECTOR_SET(Name) -#endif -#ifndef OPENMP_CONTEXT_SELECTOR -#define OPENMP_CONTEXT_SELECTOR(Name) -#endif #ifndef OPENMP_LASTPRIVATE_KIND #define OPENMP_LASTPRIVATE_KIND(Name) #endif @@ -219,14 +213,6 @@ #define OPENMP_FLUSH_CLAUSE(Name) #endif -// OpenMP context selector sets. -OPENMP_CONTEXT_SELECTOR_SET(implementation) -OPENMP_CONTEXT_SELECTOR_SET(device) - -// OpenMP context selectors. -OPENMP_CONTEXT_SELECTOR(vendor) -OPENMP_CONTEXT_SELECTOR(kind) - // OpenMP clauses. 
OPENMP_CLAUSE(allocator, OMPAllocatorClause) OPENMP_CLAUSE(if, OMPIfClause) @@ -1102,8 +1088,6 @@ OPENMP_FLUSH_CLAUSE(release) #undef OPENMP_FLUSH_CLAUSE #undef OPENMP_ORDER_KIND #undef OPENMP_LASTPRIVATE_KIND -#undef OPENMP_CONTEXT_SELECTOR -#undef OPENMP_CONTEXT_SELECTOR_SET #undef OPENMP_DECLARE_VARIANT_CLAUSE #undef OPENMP_DEVICE_TYPE_KIND #undef OPENMP_ALLOCATE_CLAUSE diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h index 86c4ad1f754d6..2a08ef6d372aa 100644 --- a/clang/include/clang/Basic/OpenMPKinds.h +++ b/clang/include/clang/Basic/OpenMPKinds.h @@ -19,45 +19,6 @@ namespace clang { -/// OpenMP context selector sets. -enum OpenMPContextSelectorSetKind { -#define OPENMP_CONTEXT_SELECTOR_SET(Name) OMP_CTX_SET_##Name, -#include "clang/Basic/OpenMPKinds.def" - OMP_CTX_SET_unknown, -}; - -/// OpenMP context selectors. -enum OpenMPContextSelectorKind { -#define OPENMP_CONTEXT_SELECTOR(Name) OMP_CTX_##Name, -#include "clang/Basic/OpenMPKinds.def" - OMP_CTX_unknown, -}; - -OpenMPContextSelectorSetKind getOpenMPContextSelectorSet(llvm::StringRef Str); -llvm::StringRef -getOpenMPContextSelectorSetName(OpenMPContextSelectorSetKind Kind); -OpenMPContextSelectorKind getOpenMPContextSelector(llvm::StringRef Str); -llvm::StringRef getOpenMPContextSelectorName(OpenMPContextSelectorKind Kind); - -/// Struct to store the context selectors info. 
-template struct OpenMPCtxSelectorData { - OpenMPContextSelectorSetKind CtxSet = OMP_CTX_SET_unknown; - OpenMPContextSelectorKind Ctx = OMP_CTX_unknown; - ScoreT Score; - VectorType Names; - explicit OpenMPCtxSelectorData() = default; - explicit OpenMPCtxSelectorData(OpenMPContextSelectorSetKind CtxSet, - OpenMPContextSelectorKind Ctx, - const ScoreT &Score, VectorType &&Names) - : CtxSet(CtxSet), Ctx(Ctx), Score(Score), Names(Names) {} - template - explicit OpenMPCtxSelectorData(OpenMPContextSelectorSetKind CtxSet, - OpenMPContextSelectorKind Ctx, - const ScoreT &Score, const U &Names) - : CtxSet(CtxSet), Ctx(Ctx), Score(Score), - Names(Names.begin(), Names.end()) {} -}; - /// OpenMP directives. using OpenMPDirectiveKind = llvm::omp::Directive; diff --git a/clang/include/clang/Index/IndexSymbol.h b/clang/include/clang/Index/IndexSymbol.h index 2e1e6005d68a6..de98b8147e8ad 100644 --- a/clang/include/clang/Index/IndexSymbol.h +++ b/clang/include/clang/Index/IndexSymbol.h @@ -54,6 +54,9 @@ enum class SymbolKind : uint8_t { Parameter, Using, + TemplateTypeParm, + TemplateTemplateParm, + NonTypeTemplateParm, }; enum class SymbolLanguage : uint8_t { diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 5cf1bd457eb07..c8d112054b478 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -1701,6 +1701,8 @@ class Parser : public CodeCompletionHandler { unsigned &NumLineToksConsumed, bool IsUnevaluated); + ExprResult ParseStringLiteralExpression(bool AllowUserDefinedLiteral = false); + private: ExprResult ParseExpressionWithLeadingAt(SourceLocation AtLoc); @@ -1794,8 +1796,6 @@ class Parser : public CodeCompletionHandler { SourceLocation LParenLoc, SourceLocation RParenLoc); - ExprResult ParseStringLiteralExpression(bool AllowUserDefinedLiteral = false); - ExprResult ParseGenericSelectionExpression(); ExprResult ParseObjCBoolLiteral(); @@ -2929,11 +2929,39 @@ class Parser : public CodeCompletionHandler 
{ DeclGroupPtrTy ParseOMPDeclareSimdClauses(DeclGroupPtrTy Ptr, CachedTokens &Toks, SourceLocation Loc); - /// Parses OpenMP context selectors and calls \p Callback for each - /// successfully parsed context selector. - bool - parseOpenMPContextSelectors(SourceLocation Loc, - SmallVectorImpl &Data); + + /// Parse a property kind into \p TIProperty for the selector set \p Set and + /// selector \p Selector. + void parseOMPTraitPropertyKind(OMPTraitInfo::OMPTraitProperty &TIProperty, + llvm::omp::TraitSet Set, + llvm::omp::TraitSelector Selector, + llvm::StringMap &Seen); + + /// Parse a selector kind into \p TISelector for the selector set \p Set. + void parseOMPTraitSelectorKind(OMPTraitInfo::OMPTraitSelector &TISelector, + llvm::omp::TraitSet Set, + llvm::StringMap &Seen); + + /// Parse a selector set kind into \p TISet. + void parseOMPTraitSetKind(OMPTraitInfo::OMPTraitSet &TISet, + llvm::StringMap &Seen); + + /// Parses an OpenMP context property. + void parseOMPContextProperty(OMPTraitInfo::OMPTraitSelector &TISelector, + llvm::omp::TraitSet Set, + llvm::StringMap &Seen); + + /// Parses an OpenMP context selector. + void parseOMPContextSelector(OMPTraitInfo::OMPTraitSelector &TISelector, + llvm::omp::TraitSet Set, + llvm::StringMap &SeenSelectors); + + /// Parses an OpenMP context selector set. + void parseOMPContextSelectorSet(OMPTraitInfo::OMPTraitSet &TISet, + llvm::StringMap &SeenSets); + + /// Parses OpenMP context selectors. + bool parseOMPContextSelectors(SourceLocation Loc, OMPTraitInfo &TI); /// Parse clauses for '#pragma omp declare variant'. void ParseOMPDeclareVariantClauses(DeclGroupPtrTy Ptr, CachedTokens &Toks, diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 0bc80daf06993..4ccb92aa22d87 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -9869,9 +9869,6 @@ class Sema final { public: /// Struct to store the context selectors info for declare variant directive. 
- using OMPCtxStringType = SmallString<8>; - using OMPCtxSelectorData = - OpenMPCtxSelectorData, ExprResult>; /// Checks if the variant/multiversion functions are compatible. bool areMultiversionVariantFunctionsCompatible( @@ -10343,10 +10340,12 @@ class Sema final { /// applied to. /// \param VariantRef Expression that references the variant function, which /// must be used instead of the original one, specified in \p DG. + /// \param TI The trait info object representing the match clause. /// \returns None, if the function/variant function are not compatible with /// the pragma, pair of original function/variant ref expression otherwise. - Optional> checkOpenMPDeclareVariantFunction( - DeclGroupPtrTy DG, Expr *VariantRef, SourceRange SR); + Optional> + checkOpenMPDeclareVariantFunction(DeclGroupPtrTy DG, Expr *VariantRef, + OMPTraitInfo &TI, SourceRange SR); /// Called on well-formed '\#pragma omp declare variant' after parsing of /// the associated method/function. @@ -10354,11 +10353,9 @@ class Sema final { /// applied to. /// \param VariantRef Expression that references the variant function, which /// must be used instead of the original one, specified in \p DG. - /// \param Data Set of context-specific data for the specified context - /// selector. + /// \param TI The context traits associated with the function variant. void ActOnOpenMPDeclareVariantDirective(FunctionDecl *FD, Expr *VariantRef, - SourceRange SR, - ArrayRef Data); + OMPTraitInfo *TI, SourceRange SR); OMPClause *ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, diff --git a/clang/include/clang/Serialization/ASTRecordReader.h b/clang/include/clang/Serialization/ASTRecordReader.h index f6dc8b2b7ae2d..362296024a970 100644 --- a/clang/include/clang/Serialization/ASTRecordReader.h +++ b/clang/include/clang/Serialization/ASTRecordReader.h @@ -22,6 +22,7 @@ #include "llvm/ADT/APSInt.h" namespace clang { +struct OMPTraitInfo; /// An object for streaming information from a record. 
class ASTRecordReader @@ -258,6 +259,9 @@ class ASTRecordReader return Reader->ReadCXXTemporary(*F, Record, Idx); } + /// Read an OMPTraitInfo object, advancing Idx. + OMPTraitInfo *readOMPTraitInfo(); + /// Read an OpenMP clause, advancing Idx. OMPClause *readOMPClause(); diff --git a/clang/include/clang/Serialization/ASTRecordWriter.h b/clang/include/clang/Serialization/ASTRecordWriter.h index 43af68628ecc7..2a35c694ccf8d 100644 --- a/clang/include/clang/Serialization/ASTRecordWriter.h +++ b/clang/include/clang/Serialization/ASTRecordWriter.h @@ -266,6 +266,9 @@ class ASTRecordWriter void AddCXXDefinitionData(const CXXRecordDecl *D); + /// Write an OMPTraitInfo object. + void writeOMPTraitInfo(OMPTraitInfo *TI); + void writeOMPClause(OMPClause *C); /// Emit a string. diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index 6eac98250c8ff..1cd1c82c8f9d4 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -1722,3 +1722,107 @@ void OMPClausePrinter::VisitOMPOrderClause(OMPOrderClause *Node) { OS << "order(" << getOpenMPSimpleClauseTypeName(OMPC_order, Node->getKind()) << ")"; } + +void OMPTraitInfo::getAsVariantMatchInfo( + ASTContext &ASTCtx, llvm::omp::VariantMatchInfo &VMI) const { + for (const OMPTraitSet &Set : Sets) { + for (const OMPTraitSelector &Selector : Set.Selectors) { + + // User conditions are special as we evaluate the condition here. + if (Selector.Kind == llvm::omp::TraitSelector::user_condition) { + assert(Selector.ScoreOrCondition && + "Ill-formed user condition, expected condition expression!"); + assert(Selector.Properties.size() == 1 && + Selector.Properties.front().Kind == + llvm::omp::TraitProperty::user_condition_unknown && + "Ill-formed user condition, expected unknown trait property!"); + + llvm::APInt CondVal = + Selector.ScoreOrCondition->EvaluateKnownConstInt(ASTCtx); + VMI.addTrait(CondVal.isNullValue() + ? 
llvm::omp::TraitProperty::user_condition_false + : llvm::omp::TraitProperty::user_condition_true); + continue; + } + + llvm::APInt Score; + llvm::APInt *ScorePtr = nullptr; + if (Selector.ScoreOrCondition) { + Score = Selector.ScoreOrCondition->EvaluateKnownConstInt(ASTCtx); + ScorePtr = &Score; + } + for (const OMPTraitProperty &Property : Selector.Properties) + VMI.addTrait(Set.Kind, Property.Kind, ScorePtr); + + if (Set.Kind != llvm::omp::TraitSet::construct) + continue; + + // TODO: This might not hold once we implement SIMD properly. + assert(Selector.Properties.size() == 1 && + Selector.Properties.front().Kind == + llvm::omp::getOpenMPContextTraitPropertyForSelector( + Selector.Kind) && + "Ill-formed construct selector!"); + + VMI.ConstructTraits.push_back(Selector.Properties.front().Kind); + } + } +} + +void OMPTraitInfo::print(llvm::raw_ostream &OS, + const PrintingPolicy &Policy) const { + bool FirstSet = true; + for (const OMPTraitInfo::OMPTraitSet &Set : Sets) { + if (!FirstSet) + OS << ", "; + FirstSet = false; + OS << llvm::omp::getOpenMPContextTraitSetName(Set.Kind) << "={"; + + bool FirstSelector = true; + for (const OMPTraitInfo::OMPTraitSelector &Selector : Set.Selectors) { + if (!FirstSelector) + OS << ", "; + FirstSelector = false; + OS << llvm::omp::getOpenMPContextTraitSelectorName(Selector.Kind); + + bool AllowsTraitScore = false; + bool RequiresProperty = false; + llvm::omp::isValidTraitSelectorForTraitSet( + Selector.Kind, Set.Kind, AllowsTraitScore, RequiresProperty); + + if (!RequiresProperty) + continue; + + OS << "("; + if (Selector.Kind == llvm::omp::TraitSelector::user_condition) { + Selector.ScoreOrCondition->printPretty(OS, nullptr, Policy); + } else { + + if (Selector.ScoreOrCondition) { + OS << "score("; + Selector.ScoreOrCondition->printPretty(OS, nullptr, Policy); + OS << "): "; + } + + bool FirstProperty = true; + for (const OMPTraitInfo::OMPTraitProperty &Property : + Selector.Properties) { + if (!FirstProperty) + OS << ", "; + 
FirstProperty = false; + OS << llvm::omp::getOpenMPContextTraitPropertyName(Property.Kind); + } + } + OS << ")"; + } + OS << "}"; + } +} + +llvm::raw_ostream &clang::operator<<(llvm::raw_ostream &OS, + const OMPTraitInfo &TI) { + LangOptions LO; + PrintingPolicy Policy(LO); + TI.print(OS, Policy); + return OS; +} diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index 70817f8e464ae..ff0f287003bfc 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -20,49 +20,6 @@ using namespace clang; using namespace llvm::omp; -OpenMPContextSelectorSetKind -clang::getOpenMPContextSelectorSet(llvm::StringRef Str) { - return llvm::StringSwitch(Str) -#define OPENMP_CONTEXT_SELECTOR_SET(Name) .Case(#Name, OMP_CTX_SET_##Name) -#include "clang/Basic/OpenMPKinds.def" - .Default(OMP_CTX_SET_unknown); -} - -llvm::StringRef -clang::getOpenMPContextSelectorSetName(OpenMPContextSelectorSetKind Kind) { - switch (Kind) { - case OMP_CTX_SET_unknown: - return "unknown"; -#define OPENMP_CONTEXT_SELECTOR_SET(Name) \ - case OMP_CTX_SET_##Name: \ - return #Name; -#include "clang/Basic/OpenMPKinds.def" - break; - } - llvm_unreachable("Invalid OpenMP context selector set kind"); -} - -OpenMPContextSelectorKind clang::getOpenMPContextSelector(llvm::StringRef Str) { - return llvm::StringSwitch(Str) -#define OPENMP_CONTEXT_SELECTOR(Name) .Case(#Name, OMP_CTX_##Name) -#include "clang/Basic/OpenMPKinds.def" - .Default(OMP_CTX_unknown); -} - -llvm::StringRef -clang::getOpenMPContextSelectorName(OpenMPContextSelectorKind Kind) { - switch (Kind) { - case OMP_CTX_unknown: - return "unknown"; -#define OPENMP_CONTEXT_SELECTOR(Name) \ - case OMP_CTX_##Name: \ - return #Name; -#include "clang/Basic/OpenMPKinds.def" - break; - } - llvm_unreachable("Invalid OpenMP context selector kind"); -} - OpenMPClauseKind clang::getOpenMPClauseKind(StringRef Str) { // 'flush' clause cannot be specified explicitly, because this is an implicit // clause for 'flush' 
directive. If the 'flush' clause is explicitly specified diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index e41d4962f03a7..60b81492f78ea 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -11065,260 +11065,34 @@ Address CGOpenMPRuntime::getAddressOfLocalVariable(CodeGenFunction &CGF, return Address(Addr, Align); } -namespace { -using OMPContextSelectorData = - OpenMPCtxSelectorData, llvm::APSInt>; -using CompleteOMPContextSelectorData = SmallVector; -} // anonymous namespace - -/// Checks current context and returns true if it matches the context selector. -template -static bool checkContext(const OMPContextSelectorData &Data, - Arguments... Params) { - assert(Data.CtxSet != OMP_CTX_SET_unknown && Data.Ctx != OMP_CTX_unknown && - "Unknown context selector or context selector set."); - return false; -} - -/// Checks for implementation={vendor()} context selector. -/// \returns true iff ="llvm", false otherwise. -template <> -bool checkContext( - const OMPContextSelectorData &Data) { - return llvm::all_of(Data.Names, - [](StringRef S) { return !S.compare_lower("llvm"); }); -} - -/// Checks for device={kind()} context selector. -/// \returns true if ="host" and compilation is for host. -/// true if ="nohost" and compilation is for device. -/// true if ="cpu" and compilation is for Arm, X86 or PPC CPU. -/// true if ="gpu" and compilation is for NVPTX or AMDGCN. -/// false otherwise. 
-template <> -bool checkContext( - const OMPContextSelectorData &Data, CodeGenModule &CGM) { - for (StringRef Name : Data.Names) { - if (!Name.compare_lower("host")) { - if (CGM.getLangOpts().OpenMPIsDevice) - return false; - continue; - } - if (!Name.compare_lower("nohost")) { - if (!CGM.getLangOpts().OpenMPIsDevice) - return false; - continue; - } - switch (CGM.getTriple().getArch()) { - case llvm::Triple::arm: - case llvm::Triple::armeb: - case llvm::Triple::aarch64: - case llvm::Triple::aarch64_be: - case llvm::Triple::aarch64_32: - case llvm::Triple::ppc: - case llvm::Triple::ppc64: - case llvm::Triple::ppc64le: - case llvm::Triple::x86: - case llvm::Triple::x86_64: - if (Name.compare_lower("cpu")) - return false; - break; - case llvm::Triple::amdgcn: - case llvm::Triple::nvptx: - case llvm::Triple::nvptx64: - if (Name.compare_lower("gpu")) - return false; - break; - case llvm::Triple::UnknownArch: - case llvm::Triple::arc: - case llvm::Triple::avr: - case llvm::Triple::bpfel: - case llvm::Triple::bpfeb: - case llvm::Triple::hexagon: - case llvm::Triple::fpga_aoco: - case llvm::Triple::fpga_aocr: - case llvm::Triple::fpga_aocx: - case llvm::Triple::mips: - case llvm::Triple::mipsel: - case llvm::Triple::mips64: - case llvm::Triple::mips64el: - case llvm::Triple::msp430: - case llvm::Triple::r600: - case llvm::Triple::riscv32: - case llvm::Triple::riscv64: - case llvm::Triple::sparc: - case llvm::Triple::sparcv9: - case llvm::Triple::sparcel: - case llvm::Triple::systemz: - case llvm::Triple::tce: - case llvm::Triple::tcele: - case llvm::Triple::thumb: - case llvm::Triple::thumbeb: - case llvm::Triple::xcore: - case llvm::Triple::le32: - case llvm::Triple::le64: - case llvm::Triple::amdil: - case llvm::Triple::amdil64: - case llvm::Triple::hsail: - case llvm::Triple::hsail64: - case llvm::Triple::spir: - case llvm::Triple::spir64: - case llvm::Triple::kalimba: - case llvm::Triple::shave: - case llvm::Triple::lanai: - case llvm::Triple::wasm32: - case 
llvm::Triple::wasm64: - case llvm::Triple::renderscript32: - case llvm::Triple::renderscript64: - case llvm::Triple::ve: - return false; - } - } - return true; -} - -static bool matchesContext(CodeGenModule &CGM, - const CompleteOMPContextSelectorData &ContextData) { - for (const OMPContextSelectorData &Data : ContextData) { - switch (Data.Ctx) { - case OMP_CTX_vendor: - assert(Data.CtxSet == OMP_CTX_SET_implementation && - "Expected implementation context selector set."); - if (!checkContext(Data)) - return false; - break; - case OMP_CTX_kind: - assert(Data.CtxSet == OMP_CTX_SET_device && - "Expected device context selector set."); - if (!checkContext(Data, - CGM)) - return false; - break; - case OMP_CTX_unknown: - llvm_unreachable("Unknown context selector kind."); - } - } - return true; -} - -static CompleteOMPContextSelectorData -translateAttrToContextSelectorData(ASTContext &C, - const OMPDeclareVariantAttr *A) { - CompleteOMPContextSelectorData Data; - for (unsigned I = 0, E = A->scores_size(); I < E; ++I) { - Data.emplace_back(); - auto CtxSet = static_cast( - *std::next(A->ctxSelectorSets_begin(), I)); - auto Ctx = static_cast( - *std::next(A->ctxSelectors_begin(), I)); - Data.back().CtxSet = CtxSet; - Data.back().Ctx = Ctx; - const Expr *Score = *std::next(A->scores_begin(), I); - Data.back().Score = Score->EvaluateKnownConstInt(C); - switch (Ctx) { - case OMP_CTX_vendor: - assert(CtxSet == OMP_CTX_SET_implementation && - "Expected implementation context selector set."); - Data.back().Names = - llvm::makeArrayRef(A->implVendors_begin(), A->implVendors_end()); - break; - case OMP_CTX_kind: - assert(CtxSet == OMP_CTX_SET_device && - "Expected device context selector set."); - Data.back().Names = - llvm::makeArrayRef(A->deviceKinds_begin(), A->deviceKinds_end()); - break; - case OMP_CTX_unknown: - llvm_unreachable("Unknown context selector kind."); - } - } - return Data; -} - -static bool isStrictSubset(const CompleteOMPContextSelectorData &LHS, - const 
CompleteOMPContextSelectorData &RHS) { - llvm::SmallDenseMap, llvm::StringSet<>, 4> RHSData; - for (const OMPContextSelectorData &D : RHS) { - auto &Pair = RHSData.FindAndConstruct(std::make_pair(D.CtxSet, D.Ctx)); - Pair.getSecond().insert(D.Names.begin(), D.Names.end()); - } - bool AllSetsAreEqual = true; - for (const OMPContextSelectorData &D : LHS) { - auto It = RHSData.find(std::make_pair(D.CtxSet, D.Ctx)); - if (It == RHSData.end()) - return false; - if (D.Names.size() > It->getSecond().size()) - return false; - if (llvm::set_union(It->getSecond(), D.Names)) - return false; - AllSetsAreEqual = - AllSetsAreEqual && (D.Names.size() == It->getSecond().size()); - } - - return LHS.size() != RHS.size() || !AllSetsAreEqual; -} - -static bool greaterCtxScore(const CompleteOMPContextSelectorData &LHS, - const CompleteOMPContextSelectorData &RHS) { - // Score is calculated as sum of all scores + 1. - llvm::APSInt LHSScore(llvm::APInt(64, 1), /*isUnsigned=*/false); - bool RHSIsSubsetOfLHS = isStrictSubset(RHS, LHS); - if (RHSIsSubsetOfLHS) { - LHSScore = llvm::APSInt::get(0); - } else { - for (const OMPContextSelectorData &Data : LHS) { - if (Data.Score.getBitWidth() > LHSScore.getBitWidth()) { - LHSScore = LHSScore.extend(Data.Score.getBitWidth()) + Data.Score; - } else if (Data.Score.getBitWidth() < LHSScore.getBitWidth()) { - LHSScore += Data.Score.extend(LHSScore.getBitWidth()); - } else { - LHSScore += Data.Score; - } - } - } - llvm::APSInt RHSScore(llvm::APInt(64, 1), /*isUnsigned=*/false); - if (!RHSIsSubsetOfLHS && isStrictSubset(LHS, RHS)) { - RHSScore = llvm::APSInt::get(0); - } else { - for (const OMPContextSelectorData &Data : RHS) { - if (Data.Score.getBitWidth() > RHSScore.getBitWidth()) { - RHSScore = RHSScore.extend(Data.Score.getBitWidth()) + Data.Score; - } else if (Data.Score.getBitWidth() < RHSScore.getBitWidth()) { - RHSScore += Data.Score.extend(RHSScore.getBitWidth()); - } else { - RHSScore += Data.Score; - } - } - } - return 
llvm::APSInt::compareValues(LHSScore, RHSScore) >= 0; -} - /// Finds the variant function that matches current context with its context /// selector. static const FunctionDecl *getDeclareVariantFunction(CodeGenModule &CGM, const FunctionDecl *FD) { if (!FD->hasAttrs() || !FD->hasAttr()) return FD; - // Iterate through all DeclareVariant attributes and check context selectors. - const OMPDeclareVariantAttr *TopMostAttr = nullptr; - CompleteOMPContextSelectorData TopMostData; + + SmallVector VariantExprs; + SmallVector VMIs; for (const auto *A : FD->specific_attrs()) { - CompleteOMPContextSelectorData Data = - translateAttrToContextSelectorData(CGM.getContext(), A); - if (!matchesContext(CGM, Data)) + const OMPTraitInfo *TI = A->getTraitInfos(); + if (!TI) continue; - // If the attribute matches the context, find the attribute with the highest - // score. - if (!TopMostAttr || !greaterCtxScore(TopMostData, Data)) { - TopMostAttr = A; - TopMostData.swap(Data); - } + VMIs.push_back(VariantMatchInfo()); + TI->getAsVariantMatchInfo(CGM.getContext(), VMIs.back()); + VariantExprs.push_back(A->getVariantFuncRef()); } - if (!TopMostAttr) + + OMPContext Ctx(CGM.getLangOpts().OpenMPIsDevice, CGM.getTriple()); + // FIXME: Keep the context in the OMPIRBuilder so we can add constructs as we + // build them. 
+ + int BestMatchIdx = getBestVariantMatchForContext(VMIs, Ctx); + if (BestMatchIdx < 0) return FD; + return cast( - cast(TopMostAttr->getVariantFuncRef()->IgnoreParenImpCasts()) + cast(VariantExprs[BestMatchIdx]->IgnoreParenImpCasts()) ->getDecl()); } diff --git a/clang/lib/Driver/ToolChains/Ananas.cpp b/clang/lib/Driver/ToolChains/Ananas.cpp index 2f11c9739a0eb..10e4ea70db41d 100644 --- a/clang/lib/Driver/ToolChains/Ananas.cpp +++ b/clang/lib/Driver/ToolChains/Ananas.cpp @@ -103,7 +103,7 @@ void ananas::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (D.isUsingLTO()) { assert(!Inputs.empty() && "Must have at least one input."); - AddGoldPlugin(ToolChain, Args, CmdArgs, Output, Inputs[0], + addLTOOptions(ToolChain, Args, CmdArgs, Output, Inputs[0], D.getLTOMode() == LTOK_Thin); } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 5208a1953d4c3..613d47fb3ad02 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2538,9 +2538,10 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, llvm::DenormalMode DenormalFPMath = DefaultDenormalFPMath; llvm::DenormalMode DenormalFP32Math = DefaultDenormalFP32Math; - StringRef FPContract = "on"; + StringRef FPContract = ""; bool StrictFPModel = false; + if (const Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) { CmdArgs.push_back("-mlimit-float-precision"); CmdArgs.push_back(A->getValue()); @@ -2563,6 +2564,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, SignedZeros = true; // -fno_fast_math restores default denormal and fpcontract handling DenormalFPMath = DefaultDenormalFPMath; + FPContract = ""; StringRef Val = A->getValue(); if (OFastEnabled && !Val.equals("fast")) { // Only -ffp-model=fast is compatible with OFast, ignore. 
@@ -2576,10 +2578,12 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, // ffp-model= is a Driver option, it is entirely rewritten into more // granular options before being passed into cc1. // Use the gcc option in the switch below. - if (!FPModel.empty() && !FPModel.equals(Val)) + if (!FPModel.empty() && !FPModel.equals(Val)) { D.Diag(clang::diag::warn_drv_overriding_flag_option) << Args.MakeArgString("-ffp-model=" + FPModel) << Args.MakeArgString("-ffp-model=" + Val); + FPContract = ""; + } if (Val.equals("fast")) { optID = options::OPT_ffast_math; FPModel = Val; @@ -2587,7 +2591,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, } else if (Val.equals("precise")) { optID = options::OPT_ffp_contract; FPModel = Val; - FPContract = "on"; + FPContract = "fast"; PreciseFPModel = true; } else if (Val.equals("strict")) { StrictFPModel = true; @@ -2673,11 +2677,9 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, case options::OPT_ffp_contract: { StringRef Val = A->getValue(); if (PreciseFPModel) { - // When -ffp-model=precise is seen on the command line, - // the boolean PreciseFPModel is set to true which indicates - // "the current option is actually PreciseFPModel". The optID - // is changed to OPT_ffp_contract and FPContract is set to "on". - // the argument Val string is "precise": it shouldn't be checked. + // -ffp-model=precise enables ffp-contract=fast as a side effect + // the FPContract value has already been set to a string literal + // and the Val string isn't a pertinent value. 
; } else if (Val.equals("fast") || Val.equals("on") || Val.equals("off")) FPContract = Val; @@ -2774,7 +2776,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, // -fno_fast_math restores default denormal and fpcontract handling DenormalFPMath = DefaultDenormalFPMath; DenormalFP32Math = DefaultDenormalFP32Math; - FPContract = "on"; + FPContract = ""; break; } if (StrictFPModel) { @@ -2784,7 +2786,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, if (HonorINFs && HonorNaNs && !AssociativeMath && !ReciprocalMath && SignedZeros && TrappingMath && RoundingFPMath && - FPContract.equals("off")) + (FPContract.equals("off") || FPContract.empty())) // OK: Current Arg doesn't conflict with -ffp-model=strict ; else { diff --git a/clang/lib/Driver/ToolChains/CloudABI.cpp b/clang/lib/Driver/ToolChains/CloudABI.cpp index 77672a99d989c..0602e4f6d0b3d 100644 --- a/clang/lib/Driver/ToolChains/CloudABI.cpp +++ b/clang/lib/Driver/ToolChains/CloudABI.cpp @@ -75,7 +75,7 @@ void cloudabi::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (D.isUsingLTO()) { assert(!Inputs.empty() && "Must have at least one input."); - AddGoldPlugin(ToolChain, Args, CmdArgs, Output, Inputs[0], + addLTOOptions(ToolChain, Args, CmdArgs, Output, Inputs[0], D.getLTOMode() == LTOK_Thin); } diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index f082bf8ce98b5..ebdb22fae3963 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -361,7 +361,7 @@ bool tools::isUseSeparateSections(const llvm::Triple &Triple) { return Triple.getOS() == llvm::Triple::CloudABI; } -void tools::AddGoldPlugin(const ToolChain &ToolChain, const ArgList &Args, +void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, ArgStringList &CmdArgs, const InputInfo &Output, const InputInfo &Input, bool IsThinLTO) { const char *Linker = 
Args.MakeArgString(ToolChain.GetLinkerPath()); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h index bf1ab8153de78..984f3ee98af1e 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -58,7 +58,7 @@ void SplitDebugInfo(const ToolChain &TC, Compilation &C, const Tool &T, const JobAction &JA, const llvm::opt::ArgList &Args, const InputInfo &Output, const char *OutFile); -void AddGoldPlugin(const ToolChain &ToolChain, const llvm::opt::ArgList &Args, +void addLTOOptions(const ToolChain &ToolChain, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs, const InputInfo &Output, const InputInfo &Input, bool IsThinLTO); diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp index c5c6f530f48c0..3f3d6e7c72eb2 100644 --- a/clang/lib/Driver/ToolChains/FreeBSD.cpp +++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp @@ -275,7 +275,7 @@ void freebsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (D.isUsingLTO()) { assert(!Inputs.empty() && "Must have at least one input."); - AddGoldPlugin(ToolChain, Args, CmdArgs, Output, Inputs[0], + addLTOOptions(ToolChain, Args, CmdArgs, Output, Inputs[0], D.getLTOMode() == LTOK_Thin); } diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index 1e1f003daf831..6114829ac8e18 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -111,7 +111,7 @@ void fuchsia::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (D.isUsingLTO()) { assert(!Inputs.empty() && "Must have at least one input."); - AddGoldPlugin(ToolChain, Args, CmdArgs, Output, Inputs[0], + addLTOOptions(ToolChain, Args, CmdArgs, Output, Inputs[0], D.getLTOMode() == LTOK_Thin); } diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index bffaa12cbd8cb..f5ff6795c1f6b 100644 --- 
a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -544,7 +544,7 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (D.isUsingLTO()) { assert(!Inputs.empty() && "Must have at least one input."); - AddGoldPlugin(ToolChain, Args, CmdArgs, Output, Inputs[0], + addLTOOptions(ToolChain, Args, CmdArgs, Output, Inputs[0], D.getLTOMode() == LTOK_Thin); } @@ -2151,6 +2151,7 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( static const char *const RISCV64Triples[] = {"riscv64-unknown-linux-gnu", "riscv64-linux-gnu", "riscv64-unknown-elf", + "riscv64-redhat-linux", "riscv64-suse-linux"}; static const char *const SPARCv8LibDirs[] = {"/lib32", "/lib"}; diff --git a/clang/lib/Index/IndexSymbol.cpp b/clang/lib/Index/IndexSymbol.cpp index ae9134bf11826..0d2e557cdd367 100644 --- a/clang/lib/Index/IndexSymbol.cpp +++ b/clang/lib/Index/IndexSymbol.cpp @@ -357,6 +357,15 @@ SymbolInfo index::getSymbolInfo(const Decl *D) { case Decl::VarTemplate: llvm_unreachable("variables handled before"); break; + case Decl::TemplateTypeParm: + Info.Kind = SymbolKind::TemplateTypeParm; + break; + case Decl::TemplateTemplateParm: + Info.Kind = SymbolKind::TemplateTemplateParm; + break; + case Decl::NonTypeTemplateParm: + Info.Kind = SymbolKind::NonTypeTemplateParm; + break; // Other decls get the 'unknown' kind. 
default: break; @@ -517,6 +526,9 @@ StringRef index::getSymbolKindString(SymbolKind K) { case SymbolKind::ConversionFunction: return "conversion-func"; case SymbolKind::Parameter: return "param"; case SymbolKind::Using: return "using"; + case SymbolKind::TemplateTypeParm: return "template-type-param"; + case SymbolKind::TemplateTemplateParm: return "template-template-param"; + case SymbolKind::NonTypeTemplateParm: return "non-type-template-param"; } llvm_unreachable("invalid symbol kind"); } diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index fbabe92977c9d..e1bcbdb05499b 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -19,6 +19,7 @@ #include "clang/Sema/Scope.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/UniqueVector.h" +#include "llvm/Frontend/OpenMP/OMPContext.h" using namespace clang; using namespace llvm::omp; @@ -810,10 +811,225 @@ Parser::ParseOMPDeclareSimdClauses(Parser::DeclGroupPtrTy Ptr, LinModifiers, Steps, SourceRange(Loc, EndLoc)); } +namespace { +/// Constant used in the diagnostics to distinguish the levels in an OpenMP +/// contexts: selector-set={selector(trait, ...), ...}, .... +enum OMPContextLvl { + CONTEXT_SELECTOR_SET_LVL = 0, + CONTEXT_SELECTOR_LVL = 1, + CONTEXT_TRAIT_LVL = 2, +}; + +static StringRef stringLiteralParser(Parser &P) { + ExprResult Res = P.ParseStringLiteralExpression(true); + return Res.isUsable() ? 
Res.getAs()->getString() : ""; +} + +static StringRef getNameFromIdOrString(Parser &P, Token &Tok, + OMPContextLvl Lvl) { + if (Tok.is(tok::identifier)) { + llvm::SmallString<16> Buffer; + StringRef Name = P.getPreprocessor().getSpelling(Tok, Buffer); + (void)P.ConsumeToken(); + return Name; + } + + if (tok::isStringLiteral(Tok.getKind())) + return stringLiteralParser(P); + + P.Diag(Tok.getLocation(), + diag::warn_omp_declare_variant_string_literal_or_identifier) + << Lvl; + return ""; +} + +static bool checkForDuplicates(Parser &P, StringRef Name, + SourceLocation NameLoc, + llvm::StringMap &Seen, + OMPContextLvl Lvl) { + auto Res = Seen.try_emplace(Name, NameLoc); + if (Res.second) + return false; + + // Each trait-set-selector-name, trait-selector-name and trait-name can + // only be specified once. + P.Diag(NameLoc, diag::warn_omp_declare_variant_ctx_mutiple_use) + << Lvl << Name; + P.Diag(Res.first->getValue(), diag::note_omp_declare_variant_ctx_used_here) + << Lvl << Name; + return true; +} +} // namespace + +void Parser::parseOMPTraitPropertyKind( + OMPTraitInfo::OMPTraitProperty &TIProperty, llvm::omp::TraitSet Set, + llvm::omp::TraitSelector Selector, llvm::StringMap &Seen) { + TIProperty.Kind = TraitProperty::invalid; + + SourceLocation NameLoc = Tok.getLocation(); + StringRef Name = + getNameFromIdOrString(*this, Tok, CONTEXT_TRAIT_LVL); + if (Name.empty()) { + Diag(Tok.getLocation(), diag::note_omp_declare_variant_ctx_options) + << CONTEXT_TRAIT_LVL << listOpenMPContextTraitProperties(Set, Selector); + return; + } + + TIProperty.Kind = getOpenMPContextTraitPropertyKind(Set, Name); + if (TIProperty.Kind != TraitProperty::invalid) { + if (checkForDuplicates(*this, Name, NameLoc, Seen, CONTEXT_TRAIT_LVL)) + TIProperty.Kind = TraitProperty::invalid; + return; + } + + // It follows diagnosis and helping notes. + // FIXME: We should move the diagnosis string generation into libFrontend. 
+ Diag(NameLoc, diag::warn_omp_declare_variant_ctx_not_a_property) + << Name << getOpenMPContextTraitSelectorName(Selector) + << getOpenMPContextTraitSetName(Set); + + TraitSet SetForName = getOpenMPContextTraitSetKind(Name); + if (SetForName != TraitSet::invalid) { + Diag(NameLoc, diag::note_omp_declare_variant_ctx_is_a) + << Name << CONTEXT_SELECTOR_SET_LVL << CONTEXT_TRAIT_LVL; + Diag(NameLoc, diag::note_omp_declare_variant_ctx_try) + << Name << "" + << "()"; + return; + } + TraitSelector SelectorForName = getOpenMPContextTraitSelectorKind(Name); + if (SelectorForName != TraitSelector::invalid) { + Diag(NameLoc, diag::note_omp_declare_variant_ctx_is_a) + << Name << CONTEXT_SELECTOR_LVL << CONTEXT_TRAIT_LVL; + bool AllowsTraitScore = false; + bool RequiresProperty = false; + isValidTraitSelectorForTraitSet( + SelectorForName, getOpenMPContextTraitSetForSelector(SelectorForName), + AllowsTraitScore, RequiresProperty); + Diag(NameLoc, diag::note_omp_declare_variant_ctx_try) + << getOpenMPContextTraitSetName( + getOpenMPContextTraitSetForSelector(SelectorForName)) + << Name << (RequiresProperty ? 
"()" : ""); + return; + } + for (const auto &PotentialSet : + {TraitSet::construct, TraitSet::user, TraitSet::implementation, + TraitSet::device}) { + TraitProperty PropertyForName = + getOpenMPContextTraitPropertyKind(PotentialSet, Name); + if (PropertyForName == TraitProperty::invalid) + continue; + Diag(NameLoc, diag::note_omp_declare_variant_ctx_try) + << getOpenMPContextTraitSetName( + getOpenMPContextTraitSetForProperty(PropertyForName)) + << getOpenMPContextTraitSelectorName( + getOpenMPContextTraitSelectorForProperty(PropertyForName)) + << ("(" + Name + ")").str(); + return; + } + Diag(NameLoc, diag::note_omp_declare_variant_ctx_options) + << CONTEXT_TRAIT_LVL << listOpenMPContextTraitProperties(Set, Selector); +} + +void Parser::parseOMPContextProperty(OMPTraitInfo::OMPTraitSelector &TISelector, + llvm::omp::TraitSet Set, + llvm::StringMap &Seen) { + assert(TISelector.Kind != TraitSelector::user_condition && + "User conditions are special properties not handled here!"); + + SourceLocation PropertyLoc = Tok.getLocation(); + OMPTraitInfo::OMPTraitProperty TIProperty; + parseOMPTraitPropertyKind(TIProperty, Set, TISelector.Kind, Seen); + + // If we have an invalid property here we already issued a warning. + if (TIProperty.Kind == TraitProperty::invalid) { + if (PropertyLoc != Tok.getLocation()) + Diag(Tok.getLocation(), diag::note_omp_declare_variant_ctx_continue_here) + << CONTEXT_TRAIT_LVL; + return; + } + + if (isValidTraitPropertyForTraitSetAndSelector(TIProperty.Kind, + TISelector.Kind, Set)) { + // If we make it here the property, selector, set, score, condition, ... are + // all valid (or have been corrected). Thus we can record the property. 
+ TISelector.Properties.push_back(TIProperty); + return; + } + + Diag(PropertyLoc, diag::warn_omp_ctx_incompatible_property_for_selector) + << getOpenMPContextTraitPropertyName(TIProperty.Kind) + << getOpenMPContextTraitSelectorName(TISelector.Kind) + << getOpenMPContextTraitSetName(Set); + Diag(PropertyLoc, diag::note_omp_ctx_compatible_set_and_selector_for_property) + << getOpenMPContextTraitPropertyName(TIProperty.Kind) + << getOpenMPContextTraitSelectorName( + getOpenMPContextTraitSelectorForProperty(TIProperty.Kind)) + << getOpenMPContextTraitSetName( + getOpenMPContextTraitSetForProperty(TIProperty.Kind)); + Diag(Tok.getLocation(), diag::note_omp_declare_variant_ctx_continue_here) + << CONTEXT_TRAIT_LVL; +} + +void Parser::parseOMPTraitSelectorKind( + OMPTraitInfo::OMPTraitSelector &TISelector, llvm::omp::TraitSet Set, + llvm::StringMap &Seen) { + TISelector.Kind = TraitSelector::invalid; + + SourceLocation NameLoc = Tok.getLocation(); + StringRef Name = getNameFromIdOrString(*this, Tok, CONTEXT_SELECTOR_LVL + ); + if (Name.empty()) { + Diag(Tok.getLocation(), diag::note_omp_declare_variant_ctx_options) + << CONTEXT_SELECTOR_LVL << listOpenMPContextTraitSelectors(Set); + return; + } + + TISelector.Kind = getOpenMPContextTraitSelectorKind(Name); + if (TISelector.Kind != TraitSelector::invalid) { + if (checkForDuplicates(*this, Name, NameLoc, Seen, CONTEXT_SELECTOR_LVL)) + TISelector.Kind = TraitSelector::invalid; + return; + } + + // It follows diagnosis and helping notes. 
+ Diag(NameLoc, diag::warn_omp_declare_variant_ctx_not_a_selector) + << Name << getOpenMPContextTraitSetName(Set); + + TraitSet SetForName = getOpenMPContextTraitSetKind(Name); + if (SetForName != TraitSet::invalid) { + Diag(NameLoc, diag::note_omp_declare_variant_ctx_is_a) + << Name << CONTEXT_SELECTOR_SET_LVL << CONTEXT_SELECTOR_LVL; + Diag(NameLoc, diag::note_omp_declare_variant_ctx_try) + << Name << "" + << ""; + return; + } + for (const auto &PotentialSet : + {TraitSet::construct, TraitSet::user, TraitSet::implementation, + TraitSet::device}) { + TraitProperty PropertyForName = + getOpenMPContextTraitPropertyKind(PotentialSet, Name); + if (PropertyForName == TraitProperty::invalid) + continue; + Diag(NameLoc, diag::note_omp_declare_variant_ctx_is_a) + << Name << CONTEXT_TRAIT_LVL << CONTEXT_SELECTOR_LVL; + Diag(NameLoc, diag::note_omp_declare_variant_ctx_try) + << getOpenMPContextTraitSetName( + getOpenMPContextTraitSetForProperty(PropertyForName)) + << getOpenMPContextTraitSelectorName( + getOpenMPContextTraitSelectorForProperty(PropertyForName)) + << ("(" + Name + ")").str(); + return; + } + Diag(NameLoc, diag::note_omp_declare_variant_ctx_options) + << CONTEXT_SELECTOR_LVL << listOpenMPContextTraitSelectors(Set); +} + /// Parse optional 'score' '(' ')' ':'. 
static ExprResult parseContextScore(Parser &P) { ExprResult ScoreExpr; - Sema::OMPCtxStringType Buffer; + llvm::SmallString<16> Buffer; StringRef SelectorName = P.getPreprocessor().getSpelling(P.getCurToken(), Buffer); if (!SelectorName.equals("score")) @@ -825,246 +1041,266 @@ static ExprResult parseContextScore(Parser &P) { if (P.getCurToken().is(tok::colon)) (void)P.ConsumeAnyToken(); else - P.Diag(P.getCurToken(), diag::warn_pragma_expected_colon) - << "context selector score clause"; + P.Diag(P.getCurToken(), diag::warn_omp_declare_variant_expected) + << "':'" + << "score expression"; return ScoreExpr; } -/// Parse context selector for 'implementation' selector set: -/// 'vendor' '(' [ 'score' '(' ')' ':' ] { ',' } -/// ')' -static void -parseImplementationSelector(Parser &P, SourceLocation Loc, - llvm::StringMap &UsedCtx, - SmallVectorImpl &Data) { - const Token &Tok = P.getCurToken(); - // Parse inner context selector set name, if any. - if (!Tok.is(tok::identifier)) { - P.Diag(Tok.getLocation(), diag::warn_omp_declare_variant_cs_name_expected) - << "implementation"; - // Skip until either '}', ')', or end of directive. - while (!P.SkipUntil(tok::r_brace, tok::r_paren, - tok::annot_pragma_openmp_end, Parser::StopBeforeMatch)) - ; - return; - } - Sema::OMPCtxStringType Buffer; - StringRef CtxSelectorName = P.getPreprocessor().getSpelling(Tok, Buffer); - auto Res = UsedCtx.try_emplace(CtxSelectorName, Tok.getLocation()); - if (!Res.second) { - // OpenMP 5.0, 2.3.2 Context Selectors, Restrictions. - // Each trait-selector-name can only be specified once. - P.Diag(Tok.getLocation(), diag::err_omp_declare_variant_ctx_mutiple_use) - << CtxSelectorName << "implementation"; - P.Diag(Res.first->getValue(), diag::note_omp_declare_variant_ctx_used_here) - << CtxSelectorName; - } - OpenMPContextSelectorKind CSKind = getOpenMPContextSelector(CtxSelectorName); - (void)P.ConsumeToken(); - switch (CSKind) { - case OMP_CTX_vendor: { - // Parse '('. 
- BalancedDelimiterTracker T(P, tok::l_paren, tok::annot_pragma_openmp_end); - (void)T.expectAndConsume(diag::err_expected_lparen_after, - CtxSelectorName.data()); - ExprResult Score = parseContextScore(P); - llvm::UniqueVector Vendors; - do { - // Parse . - StringRef VendorName; - if (Tok.is(tok::identifier)) { - Buffer.clear(); - VendorName = P.getPreprocessor().getSpelling(P.getCurToken(), Buffer); - (void)P.ConsumeToken(); - if (!VendorName.empty()) - Vendors.insert(VendorName); - } else { - P.Diag(Tok.getLocation(), diag::err_omp_declare_variant_item_expected) - << "vendor identifier" - << "vendor" - << "implementation"; +/// Parses an OpenMP context selector. +/// +/// ['('[] [, ]* ')'] +void Parser::parseOMPContextSelector( + OMPTraitInfo::OMPTraitSelector &TISelector, llvm::omp::TraitSet Set, + llvm::StringMap &SeenSelectors) { + unsigned short OuterPC = ParenCount; + + // If anything went wrong we issue an error or warning and then skip the rest + // of the selector. However, commas are ambiguous so we look for the nesting + // of parentheses here as well. + auto FinishSelector = [OuterPC, this]() -> void { + bool Done = false; + while (!Done) { + while (!SkipUntil({tok::r_brace, tok::r_paren, tok::comma, + tok::annot_pragma_openmp_end}, + StopBeforeMatch)) + ; + if (Tok.is(tok::r_paren) && OuterPC > ParenCount) + (void)ConsumeParen(); + if (OuterPC <= ParenCount) { + Done = true; + break; } - if (!P.TryConsumeToken(tok::comma) && Tok.isNot(tok::r_paren)) { - P.Diag(Tok, diag::err_expected_punc) - << (VendorName.empty() ? "vendor name" : VendorName); + if (!Tok.is(tok::comma) && !Tok.is(tok::r_paren)) { + Done = true; + break; } - } while (Tok.is(tok::identifier)); - // Parse ')'. 
- (void)T.consumeClose(); - if (!Vendors.empty()) - Data.emplace_back(OMP_CTX_SET_implementation, CSKind, Score, Vendors); - break; + (void)ConsumeAnyToken(); + } + Diag(Tok.getLocation(), diag::note_omp_declare_variant_ctx_continue_here) + << CONTEXT_SELECTOR_LVL; + }; + + SourceLocation SelectorLoc = Tok.getLocation(); + parseOMPTraitSelectorKind(TISelector, Set, SeenSelectors); + if (TISelector.Kind == TraitSelector::invalid) + return FinishSelector(); + + bool AllowsTraitScore = false; + bool RequiresProperty = false; + if (!isValidTraitSelectorForTraitSet(TISelector.Kind, Set, AllowsTraitScore, + RequiresProperty)) { + Diag(SelectorLoc, diag::warn_omp_ctx_incompatible_selector_for_set) + << getOpenMPContextTraitSelectorName(TISelector.Kind) + << getOpenMPContextTraitSetName(Set); + Diag(SelectorLoc, diag::note_omp_ctx_compatible_set_for_selector) + << getOpenMPContextTraitSelectorName(TISelector.Kind) + << getOpenMPContextTraitSetName( + getOpenMPContextTraitSetForSelector(TISelector.Kind)) + << RequiresProperty; + return FinishSelector(); + } + + if (!RequiresProperty) { + TISelector.Properties.push_back( + {getOpenMPContextTraitPropertyForSelector(TISelector.Kind)}); + return; } - case OMP_CTX_kind: - case OMP_CTX_unknown: - P.Diag(Tok.getLocation(), diag::warn_omp_declare_variant_cs_name_expected) - << "implementation"; - // Skip until either '}', ')', or end of directive. 
- while (!P.SkipUntil(tok::r_brace, tok::r_paren, - tok::annot_pragma_openmp_end, Parser::StopBeforeMatch)) - ; + + if (!Tok.is(tok::l_paren)) { + Diag(SelectorLoc, diag::warn_omp_ctx_selector_without_properties) + << getOpenMPContextTraitSelectorName(TISelector.Kind) + << getOpenMPContextTraitSetName(Set); + return FinishSelector(); + } + + if (TISelector.Kind == TraitSelector::user_condition) { + SourceLocation RLoc; + ExprResult Condition = ParseOpenMPParensExpr("user condition", RLoc); + if (!Condition.isUsable()) + return FinishSelector(); + TISelector.ScoreOrCondition = Condition.get(); + TISelector.Properties.push_back({TraitProperty::user_condition_unknown}); return; } + + BalancedDelimiterTracker BDT(*this, tok::l_paren, + tok::annot_pragma_openmp_end); + // Parse '('. + (void)BDT.consumeOpen(); + + ExprResult Score = parseContextScore(*this); + + if (!AllowsTraitScore && Score.isUsable()) { + Diag(Score.get()->getBeginLoc(), + diag::warn_omp_ctx_incompatible_score_for_property) + << getOpenMPContextTraitSelectorName(TISelector.Kind) + << getOpenMPContextTraitSetName(Set) << Score.get(); + Score = ExprResult(); + } + + if (Score.isUsable()) + TISelector.ScoreOrCondition = Score.get(); + + llvm::StringMap SeenProperties; + do { + parseOMPContextProperty(TISelector, Set, SeenProperties); + } while (TryConsumeToken(tok::comma)); + + // Parse ')'. + BDT.consumeClose(); } -/// Parse context selector for 'device' selector set: -/// 'kind' '(' { ',' } ')' -static void -parseDeviceSelector(Parser &P, SourceLocation Loc, - llvm::StringMap &UsedCtx, - SmallVectorImpl &Data) { - const Token &Tok = P.getCurToken(); - // Parse inner context selector set name, if any. - if (!Tok.is(tok::identifier)) { - P.Diag(Tok.getLocation(), diag::warn_omp_declare_variant_cs_name_expected) - << "device"; - // Skip until either '}', ')', or end of directive. 
- while (!P.SkipUntil(tok::r_brace, tok::r_paren, - tok::annot_pragma_openmp_end, Parser::StopBeforeMatch)) - ; +void Parser::parseOMPTraitSetKind(OMPTraitInfo::OMPTraitSet &TISet, + llvm::StringMap &Seen) { + TISet.Kind = TraitSet::invalid; + + SourceLocation NameLoc = Tok.getLocation(); + StringRef Name = getNameFromIdOrString(*this, Tok, CONTEXT_SELECTOR_SET_LVL + ); + if (Name.empty()) { + Diag(Tok.getLocation(), diag::note_omp_declare_variant_ctx_options) + << CONTEXT_SELECTOR_SET_LVL << listOpenMPContextTraitSets(); return; } - Sema::OMPCtxStringType Buffer; - StringRef CtxSelectorName = P.getPreprocessor().getSpelling(Tok, Buffer); - auto Res = UsedCtx.try_emplace(CtxSelectorName, Tok.getLocation()); - if (!Res.second) { - // OpenMP 5.0, 2.3.2 Context Selectors, Restrictions. - // Each trait-selector-name can only be specified once. - P.Diag(Tok.getLocation(), diag::err_omp_declare_variant_ctx_mutiple_use) - << CtxSelectorName << "device"; - P.Diag(Res.first->getValue(), diag::note_omp_declare_variant_ctx_used_here) - << CtxSelectorName; - } - OpenMPContextSelectorKind CSKind = getOpenMPContextSelector(CtxSelectorName); - (void)P.ConsumeToken(); - switch (CSKind) { - case OMP_CTX_kind: { - // Parse '('. - BalancedDelimiterTracker T(P, tok::l_paren, tok::annot_pragma_openmp_end); - (void)T.expectAndConsume(diag::err_expected_lparen_after, - CtxSelectorName.data()); - llvm::UniqueVector Kinds; - do { - // Parse . 
- StringRef KindName; - if (Tok.is(tok::identifier)) { - Buffer.clear(); - KindName = P.getPreprocessor().getSpelling(P.getCurToken(), Buffer); - SourceLocation SLoc = P.getCurToken().getLocation(); - (void)P.ConsumeToken(); - if (llvm::StringSwitch(KindName) - .Case("host", false) - .Case("nohost", false) - .Case("cpu", false) - .Case("gpu", false) - .Case("fpga", false) - .Default(true)) { - P.Diag(SLoc, diag::err_omp_wrong_device_kind_trait) << KindName; - } else { - Kinds.insert(KindName); - } - } else { - P.Diag(Tok.getLocation(), diag::err_omp_declare_variant_item_expected) - << "'host', 'nohost', 'cpu', 'gpu', or 'fpga'" - << "kind" - << "device"; + + TISet.Kind = getOpenMPContextTraitSetKind(Name); + if (TISet.Kind != TraitSet::invalid) { + if (checkForDuplicates(*this, Name, NameLoc, Seen, + CONTEXT_SELECTOR_SET_LVL)) + TISet.Kind = TraitSet::invalid; + return; + } + + // It follows diagnosis and helping notes. + Diag(NameLoc, diag::warn_omp_declare_variant_ctx_not_a_set) << Name; + + TraitSelector SelectorForName = getOpenMPContextTraitSelectorKind(Name); + if (SelectorForName != TraitSelector::invalid) { + Diag(NameLoc, diag::note_omp_declare_variant_ctx_is_a) + << Name << CONTEXT_SELECTOR_LVL << CONTEXT_SELECTOR_SET_LVL; + bool AllowsTraitScore = false; + bool RequiresProperty = false; + isValidTraitSelectorForTraitSet( + SelectorForName, getOpenMPContextTraitSetForSelector(SelectorForName), + AllowsTraitScore, RequiresProperty); + Diag(NameLoc, diag::note_omp_declare_variant_ctx_try) + << getOpenMPContextTraitSetName( + getOpenMPContextTraitSetForSelector(SelectorForName)) + << Name << (RequiresProperty ? 
"()" : ""); + return; + } + for (const auto &PotentialSet : + {TraitSet::construct, TraitSet::user, TraitSet::implementation, + TraitSet::device}) { + TraitProperty PropertyForName = + getOpenMPContextTraitPropertyKind(PotentialSet, Name); + if (PropertyForName == TraitProperty::invalid) + continue; + Diag(NameLoc, diag::note_omp_declare_variant_ctx_is_a) + << Name << CONTEXT_TRAIT_LVL << CONTEXT_SELECTOR_SET_LVL; + Diag(NameLoc, diag::note_omp_declare_variant_ctx_try) + << getOpenMPContextTraitSetName( + getOpenMPContextTraitSetForProperty(PropertyForName)) + << getOpenMPContextTraitSelectorName( + getOpenMPContextTraitSelectorForProperty(PropertyForName)) + << ("(" + Name + ")").str(); + return; + } + Diag(NameLoc, diag::note_omp_declare_variant_ctx_options) + << CONTEXT_SELECTOR_SET_LVL << listOpenMPContextTraitSets(); +} + +/// Parses an OpenMP context selector set. +/// +/// '=' '{' [, ]* '}' +void Parser::parseOMPContextSelectorSet( + OMPTraitInfo::OMPTraitSet &TISet, + llvm::StringMap &SeenSets) { + auto OuterBC = BraceCount; + + // If anything went wrong we issue an error or warning and then skip the rest + // of the set. However, commas are ambiguous so we look for the nesting + // of braces here as well. + auto FinishSelectorSet = [this, OuterBC]() -> void { + bool Done = false; + while (!Done) { + while (!SkipUntil({tok::comma, tok::r_brace, tok::r_paren, + tok::annot_pragma_openmp_end}, + StopBeforeMatch)) + ; + if (Tok.is(tok::r_brace) && OuterBC > BraceCount) + (void)ConsumeBrace(); + if (OuterBC <= BraceCount) { + Done = true; + break; } - if (!P.TryConsumeToken(tok::comma) && Tok.isNot(tok::r_paren)) { - P.Diag(Tok, diag::err_expected_punc) - << (KindName.empty() ? "kind of device" : KindName); + if (!Tok.is(tok::comma) && !Tok.is(tok::r_brace)) { + Done = true; + break; } - } while (Tok.is(tok::identifier)); - // Parse ')'. 
- (void)T.consumeClose(); - if (!Kinds.empty()) - Data.emplace_back(OMP_CTX_SET_device, CSKind, ExprResult(), Kinds); - break; + (void)ConsumeAnyToken(); + } + Diag(Tok.getLocation(), diag::note_omp_declare_variant_ctx_continue_here) + << CONTEXT_SELECTOR_SET_LVL; + }; + + parseOMPTraitSetKind(TISet, SeenSets); + if (TISet.Kind == TraitSet::invalid) + return FinishSelectorSet(); + + // Parse '='. + if (!TryConsumeToken(tok::equal)) + Diag(Tok.getLocation(), diag::warn_omp_declare_variant_expected) + << "=" + << ("context set name \"" + getOpenMPContextTraitSetName(TISet.Kind) + + "\"") + .str(); + + // Parse '{'. + if (Tok.is(tok::l_brace)) { + (void)ConsumeBrace(); + } else { + Diag(Tok.getLocation(), diag::warn_omp_declare_variant_expected) + << "{" + << ("'=' that follows the context set name \"" + + getOpenMPContextTraitSetName(TISet.Kind) + "\"") + .str(); } - case OMP_CTX_vendor: - case OMP_CTX_unknown: - P.Diag(Tok.getLocation(), diag::warn_omp_declare_variant_cs_name_expected) - << "device"; - // Skip until either '}', ')', or end of directive. - while (!P.SkipUntil(tok::r_brace, tok::r_paren, - tok::annot_pragma_openmp_end, Parser::StopBeforeMatch)) - ; - return; + + llvm::StringMap SeenSelectors; + do { + OMPTraitInfo::OMPTraitSelector TISelector; + parseOMPContextSelector(TISelector, TISet.Kind, SeenSelectors); + if (TISelector.Kind != TraitSelector::invalid && + !TISelector.Properties.empty()) + TISet.Selectors.push_back(TISelector); + } while (TryConsumeToken(tok::comma)); + + // Parse '}'. + if (Tok.is(tok::r_brace)) { + (void)ConsumeBrace(); + } else { + Diag(Tok.getLocation(), diag::warn_omp_declare_variant_expected) + << "}" + << ("context selectors for the context set \"" + + getOpenMPContextTraitSetName(TISet.Kind) + "\"") + .str(); } } -/// Parses clauses for 'declare variant' directive. 
-/// clause: -/// '=' '{' '}' -/// [ ',' '=' '{' '}' ] -bool Parser::parseOpenMPContextSelectors( - SourceLocation Loc, SmallVectorImpl &Data) { - llvm::StringMap UsedCtxSets; +/// Parse OpenMP context selectors: +/// +/// [, ]* +bool Parser::parseOMPContextSelectors(SourceLocation Loc, OMPTraitInfo &TI) { + llvm::StringMap SeenSets; do { - // Parse inner context selector set name. - if (!Tok.is(tok::identifier)) { - Diag(Tok.getLocation(), diag::err_omp_declare_variant_no_ctx_selector) - << getOpenMPClauseName(OMPC_match); - return true; - } - Sema::OMPCtxStringType Buffer; - StringRef CtxSelectorSetName = PP.getSpelling(Tok, Buffer); - auto Res = UsedCtxSets.try_emplace(CtxSelectorSetName, Tok.getLocation()); - if (!Res.second) { - // OpenMP 5.0, 2.3.2 Context Selectors, Restrictions. - // Each trait-set-selector-name can only be specified once. - Diag(Tok.getLocation(), diag::err_omp_declare_variant_ctx_set_mutiple_use) - << CtxSelectorSetName; - Diag(Res.first->getValue(), - diag::note_omp_declare_variant_ctx_set_used_here) - << CtxSelectorSetName; - } - // Parse '='. - (void)ConsumeToken(); - if (Tok.isNot(tok::equal)) { - Diag(Tok.getLocation(), diag::err_omp_declare_variant_equal_expected) - << CtxSelectorSetName; - return true; - } - (void)ConsumeToken(); - // TBD: add parsing of known context selectors. - // Unknown selector - just ignore it completely. - { - // Parse '{'. - BalancedDelimiterTracker TBr(*this, tok::l_brace, - tok::annot_pragma_openmp_end); - if (TBr.expectAndConsume(diag::err_expected_lbrace_after, "=")) - return true; - OpenMPContextSelectorSetKind CSSKind = - getOpenMPContextSelectorSet(CtxSelectorSetName); - llvm::StringMap UsedCtx; - do { - switch (CSSKind) { - case OMP_CTX_SET_implementation: - parseImplementationSelector(*this, Loc, UsedCtx, Data); - break; - case OMP_CTX_SET_device: - parseDeviceSelector(*this, Loc, UsedCtx, Data); - break; - case OMP_CTX_SET_unknown: - // Skip until either '}', ')', or end of directive. 
- while (!SkipUntil(tok::r_brace, tok::r_paren, - tok::annot_pragma_openmp_end, StopBeforeMatch)) - ; - break; - } - const Token PrevTok = Tok; - if (!TryConsumeToken(tok::comma) && Tok.isNot(tok::r_brace)) - Diag(Tok, diag::err_omp_expected_comma_brace) - << (PrevTok.isAnnotation() ? "context selector trait" - : PP.getSpelling(PrevTok)); - } while (Tok.is(tok::identifier)); - // Parse '}'. - (void)TBr.consumeClose(); - } - // Consume ',' - if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::annot_pragma_openmp_end)) - (void)ExpectAndConsume(tok::comma); - } while (Tok.isAnyIdentifier()); + OMPTraitInfo::OMPTraitSet TISet; + parseOMPContextSelectorSet(TISet, SeenSets); + if (TISet.Kind != TraitSet::invalid && !TISet.Selectors.empty()) + TI.Sets.push_back(TISet); + } while (TryConsumeToken(tok::comma)); + return false; } @@ -1102,9 +1338,6 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, (void)ConsumeAnnotationToken(); return; } - Optional> DeclVarData = - Actions.checkOpenMPDeclareVariantFunction( - Ptr, AssociatedFunction.get(), SourceRange(Loc, Tok.getLocation())); // Parse 'match'. OpenMPClauseKind CKind = Tok.isAnnotation() @@ -1132,24 +1365,27 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, } // Parse inner context selectors. - SmallVector Data; - if (!parseOpenMPContextSelectors(Loc, Data)) { - // Parse ')'. - (void)T.consumeClose(); - // Need to check for extra tokens. - if (Tok.isNot(tok::annot_pragma_openmp_end)) { - Diag(Tok, diag::warn_omp_extra_tokens_at_eol) - << getOpenMPDirectiveName(OMPD_declare_variant); - } - } + OMPTraitInfo *TI = new OMPTraitInfo(); + parseOMPContextSelectors(Loc, *TI); + + // Parse ')' + (void)T.consumeClose(); + + Optional> DeclVarData = + Actions.checkOpenMPDeclareVariantFunction( + Ptr, AssociatedFunction.get(), *TI, + SourceRange(Loc, Tok.getLocation())); // Skip last tokens. 
while (Tok.isNot(tok::annot_pragma_openmp_end)) ConsumeAnyToken(); - if (DeclVarData.hasValue()) + if (DeclVarData.hasValue() && !TI->Sets.empty()) Actions.ActOnOpenMPDeclareVariantDirective( - DeclVarData.getValue().first, DeclVarData.getValue().second, - SourceRange(Loc, Tok.getLocation()), Data); + DeclVarData.getValue().first, DeclVarData.getValue().second, TI, + SourceRange(Loc, Tok.getLocation())); + else + delete TI; + // Skip the last annot_pragma_openmp_end. (void)ConsumeAnnotationToken(); } diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 9feee9dac02d7..1c396c8b66fca 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -14157,11 +14157,7 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body, // Warn if K&R function is defined without a previous declaration. // This warning is issued only if the definition itself does not provide // a prototype. Only K&R definitions do not provide a prototype. - // An empty list in a function declarator that is part of a definition - // of that function specifies that the function has no parameters - // (C99 6.7.5.3p14) - if (!FD->hasWrittenPrototype() && FD->getNumParams() > 0 && - !LangOpts.CPlusPlus) { + if (!FD->hasWrittenPrototype()) { TypeSourceInfo *TI = FD->getTypeSourceInfo(); TypeLoc TL = TI->getTypeLoc(); FunctionTypeLoc FTL = TL.getAsAdjusted(); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 5c79eb26394e0..9b3f5d87742e1 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -5369,7 +5369,8 @@ static void setPrototype(Sema &S, FunctionDecl *FD, FunctionDecl *FDWithProto, Optional> Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, - Expr *VariantRef, SourceRange SR) { + Expr *VariantRef, OMPTraitInfo &TI, + SourceRange SR) { if (!DG || DG.get().isNull()) return None; @@ -5422,12 +5423,41 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, return None; } + auto 
ShouldDelayChecks = [](Expr *&E, bool) { + return E && (E->isTypeDependent() || E->isValueDependent() || + E->containsUnexpandedParameterPack() || + E->isInstantiationDependent()); + }; // Do not check templates, wait until instantiation. - if (VariantRef->isTypeDependent() || VariantRef->isValueDependent() || - VariantRef->containsUnexpandedParameterPack() || - VariantRef->isInstantiationDependent() || FD->isDependentContext()) + if (FD->isDependentContext() || ShouldDelayChecks(VariantRef, false) || + TI.anyScoreOrCondition(ShouldDelayChecks)) return std::make_pair(FD, VariantRef); + // Deal with non-constant score and user condition expressions. + auto HandleNonConstantScoresAndConditions = [this](Expr *&E, + bool IsScore) -> bool { + llvm::APSInt Result; + if (!E || E->isIntegerConstantExpr(Result, Context)) + return false; + + if (IsScore) { + // We warn on non-constant scores and pretend they were not present. + Diag(E->getExprLoc(), diag::warn_omp_declare_variant_score_not_constant) + << E; + E = nullptr; + } else { + // We could replace a non-constant user condition with "false" but we + // will soon need to handle these anyway for the dynamic version of + // OpenMP context selectors. + Diag(E->getExprLoc(), + diag::err_omp_declare_variant_user_condition_not_constant) + << E; + } + return true; + }; + if (TI.anyScoreOrCondition(HandleNonConstantScoresAndConditions)) + return None; + // Convert VariantRef expression to the type of the original function to // resolve possible conflicts. 
ExprResult VariantRefCast; @@ -5600,75 +5630,13 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, return std::make_pair(FD, cast(DRE)); } -void Sema::ActOnOpenMPDeclareVariantDirective( - FunctionDecl *FD, Expr *VariantRef, SourceRange SR, - ArrayRef Data) { - if (Data.empty()) - return; - SmallVector CtxScores; - SmallVector CtxSets; - SmallVector Ctxs; - SmallVector ImplVendors, DeviceKinds; - bool IsError = false; - for (const OMPCtxSelectorData &D : Data) { - OpenMPContextSelectorSetKind CtxSet = D.CtxSet; - OpenMPContextSelectorKind Ctx = D.Ctx; - if (CtxSet == OMP_CTX_SET_unknown || Ctx == OMP_CTX_unknown) - return; - Expr *Score = nullptr; - if (D.Score.isUsable()) { - Score = D.Score.get(); - if (!Score->isTypeDependent() && !Score->isValueDependent() && - !Score->isInstantiationDependent() && - !Score->containsUnexpandedParameterPack()) { - Score = - PerformOpenMPImplicitIntegerConversion(Score->getExprLoc(), Score) - .get(); - if (Score) - Score = VerifyIntegerConstantExpression(Score).get(); - } - } else { - // OpenMP 5.0, 2.3.3 Matching and Scoring Context Selectors. - // The kind, arch, and isa selectors are given the values 2^l, 2^(l+1) and - // 2^(l+2), respectively, where l is the number of traits in the construct - // set. - // TODO: implement correct logic for isa and arch traits. - // TODO: take the construct context set into account when it is - // implemented. - int L = 0; // Currently set the number of traits in construct set to 0, - // since the construct trait set in not supported yet. 
- if (CtxSet == OMP_CTX_SET_device && Ctx == OMP_CTX_kind) - Score = ActOnIntegerConstant(SourceLocation(), std::pow(2, L)).get(); - else - Score = ActOnIntegerConstant(SourceLocation(), 0).get(); - } - switch (Ctx) { - case OMP_CTX_vendor: - assert(CtxSet == OMP_CTX_SET_implementation && - "Expected implementation context selector set."); - ImplVendors.append(D.Names.begin(), D.Names.end()); - break; - case OMP_CTX_kind: - assert(CtxSet == OMP_CTX_SET_device && - "Expected device context selector set."); - DeviceKinds.append(D.Names.begin(), D.Names.end()); - break; - case OMP_CTX_unknown: - llvm_unreachable("Unknown context selector kind."); - } - IsError = IsError || !Score; - CtxSets.push_back(CtxSet); - Ctxs.push_back(Ctx); - CtxScores.push_back(Score); - } - if (!IsError) { - auto *NewAttr = OMPDeclareVariantAttr::CreateImplicit( - Context, VariantRef, CtxScores.begin(), CtxScores.size(), - CtxSets.begin(), CtxSets.size(), Ctxs.begin(), Ctxs.size(), - ImplVendors.begin(), ImplVendors.size(), DeviceKinds.begin(), - DeviceKinds.size(), SR); - FD->addAttr(NewAttr); - } +void Sema::ActOnOpenMPDeclareVariantDirective(FunctionDecl *FD, + Expr *VariantRef, + OMPTraitInfo *TI, + SourceRange SR) { + auto *NewAttr = + OMPDeclareVariantAttr::CreateImplicit(Context, VariantRef, TI, SR); + FD->addAttr(NewAttr); } void Sema::markOpenMPDeclareVariantFuncsReferenced(SourceLocation Loc, @@ -10481,7 +10449,6 @@ StmtResult Sema::ActOnOpenMPTeamsDistributeSimdDirective( CS->getCapturedDecl()->setNothrow(); } - OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. 
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 0ccd188f58e43..f059de5ee4219 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -394,50 +394,43 @@ static void instantiateOMPDeclareVariantAttr( VariantFuncRef = Subst(E); } + // Copy the template version of the OMPTraitInfo and run substitute on all + // score and condition expressions. + OMPTraitInfo *TI = new OMPTraitInfo(); + *TI = *Attr.getTraitInfos(); + + // Try to substitute template parameters in score and condition expressions. + auto SubstScoreOrConditionExpr = [&S, Subst](Expr *&E, bool) { + if (E) { + EnterExpressionEvaluationContext Unevaluated( + S, Sema::ExpressionEvaluationContext::ConstantEvaluated); + ExprResult ER = Subst(E); + if (ER.isUsable()) + E = ER.get(); + else + return true; + } + return false; + }; + if (TI->anyScoreOrCondition(SubstScoreOrConditionExpr)) { + delete TI; + return; + } + // Check function/variant ref. Optional> DeclVarData = - S.checkOpenMPDeclareVariantFunction( - S.ConvertDeclToDeclGroup(New), VariantFuncRef.get(), Attr.getRange()); - if (!DeclVarData) + S.checkOpenMPDeclareVariantFunction(S.ConvertDeclToDeclGroup(New), + VariantFuncRef.get(), *TI, + Attr.getRange()); + + if (!DeclVarData) { + delete TI; return; - SmallVector Data; - for (unsigned I = 0, E = Attr.scores_size(); I < E; ++I) { - ExprResult Score; - if (Expr *E = *std::next(Attr.scores_begin(), I)) - Score = Subst(E); - // Instantiate the attribute.
- auto CtxSet = static_cast( - *std::next(Attr.ctxSelectorSets_begin(), I)); - auto Ctx = static_cast( - *std::next(Attr.ctxSelectors_begin(), I)); - switch (CtxSet) { - case OMP_CTX_SET_implementation: - switch (Ctx) { - case OMP_CTX_vendor: - Data.emplace_back(CtxSet, Ctx, Score, Attr.implVendors()); - break; - case OMP_CTX_kind: - case OMP_CTX_unknown: - llvm_unreachable("Unexpected context selector kind."); - } - break; - case OMP_CTX_SET_device: - switch (Ctx) { - case OMP_CTX_kind: - Data.emplace_back(CtxSet, Ctx, Score, Attr.deviceKinds()); - break; - case OMP_CTX_vendor: - case OMP_CTX_unknown: - llvm_unreachable("Unexpected context selector kind."); - } - break; - case OMP_CTX_SET_unknown: - llvm_unreachable("Unexpected context selector set kind."); - } } + S.ActOnOpenMPDeclareVariantDirective(DeclVarData.getValue().first, - DeclVarData.getValue().second, - Attr.getRange(), Data); + DeclVarData.getValue().second, TI, + Attr.getRange()); } static void instantiateDependentAMDGPUFlatWorkGroupSizeAttr( diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index a1161d2648387..fbd59b9319535 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -12612,3 +12612,22 @@ void OMPClauseReader::VisitOMPOrderClause(OMPOrderClause *C) { C->setLParenLoc(Record.readSourceLocation()); C->setKindKwLoc(Record.readSourceLocation()); } + +OMPTraitInfo *ASTRecordReader::readOMPTraitInfo() { + OMPTraitInfo *TI = new OMPTraitInfo(); + TI->Sets.resize(readUInt32()); + for (auto &Set : TI->Sets) { + Set.Kind = readEnum(); + Set.Selectors.resize(readUInt32()); + for (auto &Selector : Set.Selectors) { + Selector.Kind = readEnum(); + Selector.ScoreOrCondition = nullptr; + if (readBool()) + Selector.ScoreOrCondition = readExprRef(); + Selector.Properties.resize(readUInt32()); + for (auto &Property : Selector.Properties) + Property.Kind = readEnum(); + } + } + return TI; +} diff --git 
a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 20a4f78f16e97..45c10be8add72 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -2756,6 +2756,8 @@ class AttrReader { return Reader.readVersionTuple(); } + OMPTraitInfo *readOMPTraitInfo() { return Reader.readOMPTraitInfo(); } + template T *GetLocalDeclAs(uint32_t LocalID) { return Reader.GetLocalDeclAs(LocalID); } diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index f935a69769bf9..018a7386296dc 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6578,3 +6578,19 @@ void OMPClauseWriter::VisitOMPOrderClause(OMPOrderClause *C) { Record.AddSourceLocation(C->getKindKwLoc()); } +void ASTRecordWriter::writeOMPTraitInfo(OMPTraitInfo *TI) { + writeUInt32(TI->Sets.size()); + for (const auto &Set : TI->Sets) { + writeEnum(Set.Kind); + writeUInt32(Set.Selectors.size()); + for (const auto &Selector : Set.Selectors) { + writeEnum(Selector.Kind); + writeBool(Selector.ScoreOrCondition); + if (Selector.ScoreOrCondition) + writeExprRef(Selector.ScoreOrCondition); + writeUInt32(Selector.Properties.size()); + for (const auto &Property : Selector.Properties) + writeEnum(Property.Kind); + } + } +} diff --git a/clang/lib/Tooling/AllTUsExecution.cpp b/clang/lib/Tooling/AllTUsExecution.cpp index d85075f596079..777857a49e81f 100644 --- a/clang/lib/Tooling/AllTUsExecution.cpp +++ b/clang/lib/Tooling/AllTUsExecution.cpp @@ -114,8 +114,7 @@ llvm::Error AllTUsToolExecutor::execute( auto &Action = Actions.front(); { - llvm::ThreadPool Pool(ThreadCount == 0 ? 
llvm::hardware_concurrency() - : ThreadCount); + llvm::ThreadPool Pool(llvm::hardware_concurrency(ThreadCount)); for (std::string File : Files) { Pool.async( [&](std::string Path) { diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp index b4d5a29ca6959..b1b87e7fa5734 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp @@ -106,7 +106,8 @@ DependencyScanningFilesystemSharedCache:: // sharding gives a performance edge by reducing the lock contention. // FIXME: A better heuristic might also consider the OS to account for // the different cost of lock contention on different OSes. - NumShards = std::max(2u, llvm::hardware_concurrency() / 4); + NumShards = + std::max(2u, llvm::hardware_concurrency().compute_thread_count() / 4); CacheShards = std::make_unique(NumShards); } diff --git a/clang/test/CodeGen/ppc-emmintrin.c b/clang/test/CodeGen/ppc-emmintrin.c index c14b2dd210f89..631b6c9d2614a 100644 --- a/clang/test/CodeGen/ppc-emmintrin.c +++ b/clang/test/CodeGen/ppc-emmintrin.c @@ -2,9 +2,9 @@ // REQUIRES: powerpc-registered-target // RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ -// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ -// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-LE +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | 
llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-LE // CHECK-BE-DAG: @_mm_movemask_pd.perm_mask = internal constant <4 x i32> , align 16 // CHECK-BE-DAG: @_mm_shuffle_epi32.permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4 diff --git a/clang/test/CodeGen/ppc-xmmintrin.c b/clang/test/CodeGen/ppc-xmmintrin.c index d7499cbedc48d..e9466b32257f0 100644 --- a/clang/test/CodeGen/ppc-xmmintrin.c +++ b/clang/test/CodeGen/ppc-xmmintrin.c @@ -2,9 +2,9 @@ // REQUIRES: powerpc-registered-target // RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ -// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ -// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-LE +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-LE #include diff --git a/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crtbegin.o b/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crtend.o b/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crti.o 
b/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crti.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crtn.o b/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib/gcc/riscv64-redhat-linux/9/crtn.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib64/crt1.o b/clang/test/Driver/Inputs/fedora_31_riscv64_tree/usr/lib64/crt1.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/fp-model.c b/clang/test/Driver/fp-model.c index de83e4e4c9130..8bf53f6d997b3 100644 --- a/clang/test/Driver/fp-model.c +++ b/clang/test/Driver/fp-model.c @@ -1,87 +1,85 @@ // Test that incompatible combinations of -ffp-model= options // and other floating point options get a warning diagnostic. +// +// REQUIRES: clang-driver -// RUN: %clang -target x86_64 -### -ffp-model=fast -ffp-contract=off -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=fast -ffp-contract=off -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN %s // WARN: warning: overriding '-ffp-model=fast' option with '-ffp-contract=off' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=fast -ffp-contract=on -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=fast -ffp-contract=on -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN1 %s // WARN1: warning: overriding '-ffp-model=fast' option with '-ffp-contract=on' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -fassociative-math -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -fassociative-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN2 %s // WARN2: warning: overriding '-ffp-model=strict' option with '-fassociative-math' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -ffast-math -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -ffast-math -c %s 2>&1 
\ // RUN: | FileCheck --check-prefix=WARN3 %s // WARN3: warning: overriding '-ffp-model=strict' option with '-ffast-math' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -ffinite-math-only -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -ffinite-math-only -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN4 %s // WARN4: warning: overriding '-ffp-model=strict' option with '-ffinite-math-only' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -ffp-contract=fast -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -ffp-contract=fast -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN5 %s // WARN5: warning: overriding '-ffp-model=strict' option with '-ffp-contract=fast' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -ffp-contract=fast -c %s 2>&1 \ -// RUN: | FileCheck --check-prefix=WARN6 %s -// WARN6: warning: overriding '-ffp-model=strict' option with '-ffp-contract=fast' [-Woverriding-t-option] - -// RUN: %clang -target x86_64 -### -ffp-model=strict -ffp-contract=on -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -ffp-contract=on -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN7 %s // WARN7: warning: overriding '-ffp-model=strict' option with '-ffp-contract=on' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -fno-honor-infinities -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -fno-honor-infinities -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN8 %s // WARN8: warning: overriding '-ffp-model=strict' option with '-fno-honor-infinities' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -fno-honor-nans -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -fno-honor-nans -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARN9 %s // WARN9: warning: overriding '-ffp-model=strict' option with '-fno-honor-nans' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -fno-rounding-math -c %s 
2>&1 \ +// RUN: %clang -### -ffp-model=strict -fno-rounding-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARNa %s // WARNa: warning: overriding '-ffp-model=strict' option with '-fno-rounding-math' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -fno-signed-zeros -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -fno-signed-zeros -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARNb %s // WARNb: warning: overriding '-ffp-model=strict' option with '-fno-signed-zeros' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -fno-trapping-math -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -fno-trapping-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARNc %s // WARNc: warning: overriding '-ffp-model=strict' option with '-fno-trapping-math' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -freciprocal-math -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -freciprocal-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARNd %s // WARNd: warning: overriding '-ffp-model=strict' option with '-freciprocal-math' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -funsafe-math-optimizations -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -funsafe-math-optimizations -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARNe %s // WARNe: warning: overriding '-ffp-model=strict' option with '-funsafe-math-optimizations' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -ffp-model=strict -Ofast -c %s 2>&1 \ +// RUN: %clang -### -ffp-model=strict -Ofast -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=WARNf %s // WARNf: warning: overriding '-ffp-model=strict' option with '-Ofast' [-Woverriding-t-option] -// RUN: %clang -target x86_64 -### -c %s 2>&1 \ +// RUN: %clang -### -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-NOROUND %s // CHECK-NOROUND: "-cc1" // CHECK-NOROUND: "-fno-rounding-math" -// RUN: %clang -target x86_64 -### 
-frounding-math -c %s 2>&1 \ +// RUN: %clang -### -frounding-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-ROUND --implicit-check-not ffp-exception-behavior=strict %s // CHECK-ROUND: "-cc1" // CHECK-ROUND: "-frounding-math" -// RUN: %clang -target x86_64 -### -ftrapping-math -c %s 2>&1 \ +// RUN: %clang -### -ftrapping-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-TRAP %s // CHECK-TRAP: "-cc1" // CHECK-TRAP: "-ftrapping-math" // CHECK-TRAP: "-ffp-exception-behavior=strict" -// RUN: %clang -target x86_64 -### -nostdinc -ffp-model=fast -c %s 2>&1 \ +// RUN: %clang -### -nostdinc -ffp-model=fast -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-FPM-FAST %s // CHECK-FPM-FAST: "-cc1" // CHECK-FPM-FAST: "-menable-no-infs" @@ -95,41 +93,41 @@ // CHECK-FPM-FAST: "-ffast-math" // CHECK-FPM-FAST: "-ffinite-math-only" -// RUN: %clang -target x86_64 -### -nostdinc -ffp-model=precise -c %s 2>&1 \ +// RUN: %clang -### -nostdinc -ffp-model=precise -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-FPM-PRECISE %s // CHECK-FPM-PRECISE: "-cc1" -// CHECK-FPM-PRECISE: "-ffp-contract=on" +// CHECK-FPM-PRECISE: "-ffp-contract=fast" // CHECK-FPM-PRECISE: "-fno-rounding-math" -// RUN: %clang -target x86_64 -### -nostdinc -ffp-model=strict -c %s 2>&1 \ +// RUN: %clang -### -nostdinc -ffp-model=strict -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-FPM-STRICT %s // CHECK-FPM-STRICT: "-cc1" // CHECK-FPM-STRICT: "-ftrapping-math" -// CHECK-FPM-STRICT: "-ffp-contract=off" // CHECK-FPM-STRICT: "-frounding-math" // CHECK-FPM-STRICT: "-ffp-exception-behavior=strict" -// RUN: %clang -target x86_64 -### -nostdinc -ftrapping-math -ffp-exception-behavior=ignore -c %s 2>&1 \ +// RUN: %clang -### -nostdinc -ftrapping-math -ffp-exception-behavior=ignore -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-TRAP-IGNORE %s // CHECK-TRAP-IGNORE: "-cc1" // CHECK-TRAP-IGNORE: "-fno-rounding-math" // CHECK-TRAP-IGNORE: "-ffp-exception-behavior=ignore" -// RUN: %clang 
-target x86_64 -### -nostdinc -ffp-exception-behavior=strict -c %s 2>&1 \ +// RUN: %clang -### -nostdinc -ffp-exception-behavior=strict -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-FEB-STRICT %s // CHECK-FEB-STRICT: "-cc1" // CHECK-FEB-STRICT: "-fno-rounding-math" // CHECK-FEB-STRICT: "-ffp-exception-behavior=strict" -// RUN: %clang -target x86_64 -### -nostdinc -ffp-exception-behavior=maytrap -c %s 2>&1 \ +// RUN: %clang -### -nostdinc -ffp-exception-behavior=maytrap -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-FEB-MAYTRAP %s // CHECK-FEB-MAYTRAP: "-cc1" // CHECK-FEB-MAYTRAP: "-fno-rounding-math" // CHECK-FEB-MAYTRAP: "-ffp-exception-behavior=maytrap" -// RUN: %clang -target x86_64 -### -nostdinc -ffp-exception-behavior=ignore -c %s 2>&1 \ +// RUN: %clang -### -nostdinc -ffp-exception-behavior=ignore -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-FEB-IGNORE %s // CHECK-FEB-IGNORE: "-cc1" // CHECK-FEB-IGNORE: "-fno-rounding-math" // CHECK-FEB-IGNORE: "-ffp-exception-behavior=ignore" + diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c index 51227550b528d..ec539522c25dc 100644 --- a/clang/test/Driver/linux-ld.c +++ b/clang/test/Driver/linux-ld.c @@ -769,6 +769,21 @@ // CHECK-FEDORA-21-AARCH64: "{{.*}}/usr/lib/gcc/aarch64-redhat-linux/4.9.0{{/|\\\\}}crtend.o" // CHECK-FEDORA-21-AARCH64: "{{.*}}/usr/lib/gcc/aarch64-redhat-linux/4.9.0/../../../../lib64{{/|\\\\}}crtn.o" // +// Check Fedora 31 on riscv64. 
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ +// RUN: --target=riscv64-redhat-linux -rtlib=platform \ +// RUN: --gcc-toolchain="" \ +// RUN: --sysroot=%S/Inputs/fedora_31_riscv64_tree \ +// RUN: | FileCheck --check-prefix=CHECK-FEDORA-31-RISCV64 %s +// CHECK-FEDORA-31-RISCV64: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" +// CHECK-FEDORA-31-RISCV64: "{{.*}}/usr/lib/gcc/riscv64-redhat-linux/9/../../../../lib64{{/|\\\\}}crt1.o" +// CHECK-FEDORA-31-RISCV64: "{{.*}}/usr/lib/gcc/riscv64-redhat-linux/9{{/|\\\\}}crti.o" +// CHECK-FEDORA-31-RISCV64: "{{.*}}/usr/lib/gcc/riscv64-redhat-linux/9{{/|\\\\}}crtbegin.o" +// CHECK-FEDORA-31-RISCV64: "-L[[SYSROOT]]/usr/lib/gcc/riscv64-redhat-linux/9" +// CHECK-FEDORA-31-RISCV64: "-L[[SYSROOT]]/usr/lib/gcc/riscv64-redhat-linux/9/../../../../lib64" +// CHECK-FEDORA-31-RISCV64: "{{.*}}/usr/lib/gcc/riscv64-redhat-linux/9{{/|\\\\}}crtend.o" +// CHECK-FEDORA-31-RISCV64: "{{.*}}/usr/lib/gcc/riscv64-redhat-linux/9{{/|\\\\}}crtn.o" +// // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: --target=arm-unknown-linux-gnueabi -rtlib=platform \ // RUN: --gcc-toolchain="" \ diff --git a/clang/test/OpenMP/declare_variant_ast_print.c b/clang/test/OpenMP/declare_variant_ast_print.c index 0173626a79085..515d3167627c4 100644 --- a/clang/test/OpenMP/declare_variant_ast_print.c +++ b/clang/test/OpenMP/declare_variant_ast_print.c @@ -8,7 +8,7 @@ int foo(void); #pragma omp declare variant(foo) match(xxx={}, yyy={ccc}) #pragma omp declare variant(foo) match(xxx={vvv}) -#pragma omp declare variant(foo) match(implementation={vendor(llvm)}, device={kind(fpga)}) +#pragma omp declare variant(foo) match(implementation={vendor(score(0):llvm)}, device={kind(fpga)}) #pragma omp declare variant(foo) match(implementation={vendor(llvm), xxx}) #pragma omp declare variant(foo) match(implementation={vendor(unknown)}, device={kind(gpu)}) #pragma omp declare variant(foo) match(implementation={vendor(score(5): ibm, xxx, ibm)}, 
device={kind(cpu, nohost)}) @@ -19,8 +19,8 @@ int bar(void); // CHECK: int foo(); // CHECK-NEXT: #pragma omp declare variant(foo) match(device={kind(nohost)}) // CHECK-NEXT: #pragma omp declare variant(foo) match(device={kind(host)}) -// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(score(5):ibm, xxx)},device={kind(cpu, nohost)}) -// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(score(0):unknown)},device={kind(gpu)}) -// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(score(0):llvm)}) -// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(score(0):llvm)},device={kind(fpga)}) +// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(score(5): ibm)}, device={kind(cpu, nohost)}) +// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(unknown)}, device={kind(gpu)}) +// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(llvm)}) +// CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={vendor(score(0): llvm)}, device={kind(fpga)}) // CHECK-NEXT: int bar(); diff --git a/clang/test/OpenMP/declare_variant_ast_print.cpp b/clang/test/OpenMP/declare_variant_ast_print.cpp index 4964c692166fa..fdc6d18ca1340 100644 --- a/clang/test/OpenMP/declare_variant_ast_print.cpp +++ b/clang/test/OpenMP/declare_variant_ast_print.cpp @@ -17,36 +17,40 @@ T foofoo() { return T(); } // CHECK-NEXT: return int(); // CHECK-NEXT: } -// CHECK: #pragma omp declare variant(foofoo) match(implementation={vendor(score(5):ibm)},device={kind(fpga)}) -// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(score(0):unknown)}) -// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(score(0):llvm)},device={kind(cpu)}) +// CHECK: #pragma omp declare variant(foofoo) match(implementation={vendor(score(5): ibm)}, device={kind(fpga)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) 
match(implementation={vendor(unknown)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(score(0): llvm)}, device={kind(cpu)}) // CHECK-NEXT: int bar(); #pragma omp declare variant(foofoo ) match(xxx = {}) #pragma omp declare variant(foofoo ) match(xxx = {vvv}) -#pragma omp declare variant(foofoo ) match(implementation={vendor(llvm), xxx}, device={kind(cpu)}) -#pragma omp declare variant(foofoo ) match(implementation={vendor(unknown)}) -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(5): ibm)}, device={kind(fpga)}) +#pragma omp declare variant(foofoo ) match(implementation = {vendor(score(0): "llvm"), xxx}, device = {kind(cpu)}) +#pragma omp declare variant(foofoo ) match(implementation = {vendor("unknown")}) +#pragma omp declare variant(foofoo ) match(implementation = {vendor(score(5): ibm)}, device = {kind(fpga)}) int bar(); -// CHECK: #pragma omp declare variant(foofoo) match(implementation={vendor(score(C + 5):ibm, xxx)},device={kind(cpu, host)}) -// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(score(0):unknown)}) -// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(score(0):llvm)},device={kind(cpu)}) +// CHECK: #pragma omp declare variant(foofoo) match(implementation={vendor(score(C + 5): ibm)}, device={kind(cpu, host)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(unknown)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(llvm)}, device={kind(cpu)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(user={condition(false)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(user={condition(true)}) // CHECK-NEXT: template T barbar(); #pragma omp declare variant(foofoo ) match(xxx = {}) #pragma omp declare variant(foofoo ) match(xxx = {vvv}) -#pragma omp declare variant(foofoo ) match(user = {score() : condition()}) -#pragma omp declare variant(foofoo ) match(user = 
{score() : condition()}) -#pragma omp declare variant(foofoo ) match(user = {condition()}) -#pragma omp declare variant(foofoo ) match(user = {condition()}) -#pragma omp declare variant(foofoo ) match(implementation={vendor(llvm)},device={kind(cpu)}) +#pragma omp declare variant(foofoo ) match(user = {score(1 * 1 + 1) : condition(100 > 10 + 2)}) +#pragma omp declare variant(foofoo ) match(user = {score(0) : condition(0)}) +#pragma omp declare variant(foofoo ) match(user = {condition(true)}) +#pragma omp declare variant(foofoo ) match(user = {condition(false)}) +#pragma omp declare variant(foofoo ) match(implementation = {vendor(llvm)}, device = {kind(cpu)}) #pragma omp declare variant(foofoo ) match(implementation={vendor(unknown)}) #pragma omp declare variant(foofoo ) match(implementation={vendor(score(C+5): ibm, xxx, ibm)},device={kind(cpu,host)}) template T barbar(); -// CHECK: #pragma omp declare variant(foofoo) match(implementation={vendor(score(3 + 5):ibm, xxx)},device={kind(cpu, host)}) -// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(score(0):unknown)}) -// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(score(0):llvm)},device={kind(cpu)}) +// CHECK: #pragma omp declare variant(foofoo) match(implementation={vendor(score(3 + 5): ibm)}, device={kind(cpu, host)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(unknown)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(implementation={vendor(llvm)}, device={kind(cpu)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(user={condition(false)}) +// CHECK-NEXT: #pragma omp declare variant(foofoo) match(user={condition(true)}) // CHECK-NEXT: template<> int barbar(); // CHECK-NEXT: int baz() { @@ -66,19 +70,19 @@ template void h_ref(C *hp, C *hp2, C *hq, C *lin) { } -// CHECK: #pragma omp declare variant(h_ref) match(implementation={vendor(score(0):unknown)},device={kind(nohost)}) -// CHECK-NEXT: #pragma 
omp declare variant(h_ref) match(implementation={vendor(score(0):llvm)},device={kind(gpu)}) +// CHECK: #pragma omp declare variant(h_ref) match(implementation={vendor(unknown)}, device={kind(nohost)}) +// CHECK-NEXT: #pragma omp declare variant(h_ref) match(implementation={vendor(llvm)}, device={kind(gpu)}) // CHECK-NEXT: template void h(C *hp, C *hp2, C *hq, C *lin) { // CHECK-NEXT: } #pragma omp declare variant(h_ref ) match(xxx = {}) -#pragma omp declare variant(h_ref ) match(implementation={vendor(llvm)}, device={kind(gpu)}) -#pragma omp declare variant(h_ref ) match(implementation={vendor(unknown)},device={kind(nohost)}) +#pragma omp declare variant(h_ref ) match(implementation = {vendor(llvm)}, device = {kind(gpu)}) +#pragma omp declare variant(h_ref ) match(implementation = {vendor(unknown)}, device = {kind(nohost)}) template void h(C *hp, C *hp2, C *hq, C *lin) { } -// CHECK: #pragma omp declare variant(h_ref) match(implementation={vendor(score(0):unknown)},device={kind(nohost)}) -// CHECK-NEXT: #pragma omp declare variant(h_ref) match(implementation={vendor(score(0):llvm)},device={kind(gpu)}) +// CHECK: #pragma omp declare variant(h_ref) match(implementation={vendor(unknown)}, device={kind(nohost)}) +// CHECK-NEXT: #pragma omp declare variant(h_ref) match(implementation={vendor(llvm)}, device={kind(gpu)}) // CHECK-NEXT: template<> void h(float *hp, float *hp2, float *hq, float *lin) { // CHECK-NEXT: } @@ -86,7 +90,7 @@ void h(C *hp, C *hp2, C *hq, C *lin) { // CHECK-NEXT: h((float *)hp, (float *)hp2, (float *)hq, (float *)lin); // CHECK-NEXT: } #pragma omp declare variant(h_ref ) match(xxx = {}) -#pragma omp declare variant(h_ref ) match(implementation={vendor(ibm)},device={kind(cpu,gpu)}) +#pragma omp declare variant(h_ref ) match(implementation = {vendor(ibm)}, device = {kind(cpu, gpu)}) #pragma omp declare variant(h_ref ) match(implementation={vendor(unknown)}) template <> void h(double *hp, double *hp2, double *hq, double *lin) { @@ -97,36 +101,36 @@ 
void h(double *hp, double *hp2, double *hq, double *lin) { int fn(); // CHECK: int fn(int); int fn(int); -// CHECK: #pragma omp declare variant(fn) match(implementation={vendor(score(0):unknown)},device={kind(cpu, gpu)}) -// CHECK-NEXT: #pragma omp declare variant(fn) match(implementation={vendor(score(0):llvm)}) +// CHECK: #pragma omp declare variant(fn) match(implementation={vendor(unknown)}, device={kind(cpu, gpu)}) +// CHECK-NEXT: #pragma omp declare variant(fn) match(implementation={vendor(llvm)}) // CHECK-NEXT: int overload(); #pragma omp declare variant(fn) match(xxx = {}) #pragma omp declare variant(fn) match(implementation={vendor(llvm)}) -#pragma omp declare variant(fn) match(implementation={vendor(unknown)},device={kind(cpu,gpu)}) +#pragma omp declare variant(fn) match(implementation = {vendor(unknown)}, device = {kind(cpu, gpu)}) int overload(void); // CHECK: int fn_deduced_variant() { // CHECK-NEXT: return 0; // CHECK-NEXT: } auto fn_deduced_variant() { return 0; } -// CHECK: #pragma omp declare variant(fn_deduced_variant) match(implementation={vendor(score(0):unknown)},device={kind(gpu, nohost)}) -// CHECK-NEXT: #pragma omp declare variant(fn_deduced_variant) match(implementation={vendor(score(0):llvm)},device={kind(cpu, host)}) +// CHECK: #pragma omp declare variant(fn_deduced_variant) match(implementation={vendor(unknown)}, device={kind(gpu, nohost)}) +// CHECK-NEXT: #pragma omp declare variant(fn_deduced_variant) match(implementation={vendor(llvm)}, device={kind(cpu, host)}) // CHECK-NEXT: int fn_deduced(); #pragma omp declare variant(fn_deduced_variant) match(xxx = {}) -#pragma omp declare variant(fn_deduced_variant) match(implementation={vendor(llvm)},device={kind(cpu,host)}) -#pragma omp declare variant(fn_deduced_variant) match(implementation={vendor(unknown)},device={kind(gpu,nohost)}) +#pragma omp declare variant(fn_deduced_variant) match(implementation = {vendor(llvm)}, device = {kind(cpu, host)}) +#pragma omp declare 
variant(fn_deduced_variant) match(implementation = {vendor(unknown)}, device = {kind(gpu, nohost)}) int fn_deduced(); // CHECK: int fn_deduced_variant1(); int fn_deduced_variant1(); -// CHECK: #pragma omp declare variant(fn_deduced_variant1) match(implementation={vendor(score(0):unknown)},device={kind(cpu, host)}) -// CHECK-NEXT: #pragma omp declare variant(fn_deduced_variant1) match(implementation={vendor(score(0):ibm)},device={kind(gpu, nohost)}) +// CHECK: #pragma omp declare variant(fn_deduced_variant1) match(implementation={vendor(unknown)}, device={kind(cpu, host)}) +// CHECK-NEXT: #pragma omp declare variant(fn_deduced_variant1) match(implementation={vendor(ibm)}, device={kind(gpu, nohost)}) // CHECK-NEXT: int fn_deduced1() { // CHECK-NEXT: return 0; // CHECK-NEXT: } #pragma omp declare variant(fn_deduced_variant1) match(xxx = {}) -#pragma omp declare variant(fn_deduced_variant1) match(implementation={vendor(ibm)},device={kind(gpu,nohost)}) -#pragma omp declare variant(fn_deduced_variant1) match(implementation={vendor(unknown)},device={kind(cpu,host)}) +#pragma omp declare variant(fn_deduced_variant1) match(implementation = {vendor(ibm)}, device = {kind(gpu, nohost)}) +#pragma omp declare variant(fn_deduced_variant1) match(implementation = {vendor(unknown)}, device = {kind(cpu, host)}) auto fn_deduced1() { return 0; } // CHECK: struct SpecialFuncs { @@ -140,11 +144,11 @@ auto fn_deduced1() { return 0; } // CHECK-NEXT: } // CHECK-NEXT: void bar(int) { // CHECK-NEXT: } -// CHECK-NEXT: #pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(score(0):unknown)},device={kind(nohost)}) -// CHECK-NEXT: #pragma omp declare variant(SpecialFuncs::bar) match(implementation={vendor(score(0):ibm)},device={kind(cpu)}) +// CHECK-NEXT: #pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(unknown)}, device={kind(nohost)}) +// CHECK-NEXT: #pragma omp declare variant(SpecialFuncs::bar) match(implementation={vendor(ibm)}, 
device={kind(cpu)}) // CHECK-NEXT: void foo1() { // CHECK-NEXT: } -// CHECK-NEXT: #pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(score(0):unknown)},device={kind(cpu, host)}) +// CHECK-NEXT: #pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(unknown)}, device={kind(cpu, host)}) // CHECK-NEXT: void xxx(); // CHECK-NEXT: } s; struct SpecialFuncs { @@ -157,14 +161,14 @@ struct SpecialFuncs { void bar(int) {} #pragma omp declare variant(SpecialFuncs::baz) match(xxx = {}) #pragma omp declare variant(SpecialFuncs::bar) match(xxx = {}) -#pragma omp declare variant(SpecialFuncs::bar) match(implementation={vendor(ibm)},device={kind(cpu)}) -#pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(unknown)},device={kind(nohost)}) +#pragma omp declare variant(SpecialFuncs::bar) match(implementation = {vendor(ibm)}, device = {kind(cpu)}) +#pragma omp declare variant(SpecialFuncs::baz) match(implementation = {vendor(unknown)}, device = {kind(nohost)}) void foo1() {} -#pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(unknown)},device={kind(cpu, host)}) +#pragma omp declare variant(SpecialFuncs::baz) match(implementation = {vendor(unknown)}, device = {kind(cpu, host)}) void xxx(); } s; -// CHECK: #pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(score(0):unknown)},device={kind(cpu, host)}) +// CHECK: #pragma omp declare variant(SpecialFuncs::baz) match(implementation={vendor(unknown)}, device={kind(cpu, host)}) // CHECK-NEXT: void SpecialFuncs::xxx() { // CHECK-NEXT: } void SpecialFuncs::xxx() {} @@ -172,12 +176,12 @@ void SpecialFuncs::xxx() {} // CHECK: static void static_f_variant() { // CHECK-NEXT: } static void static_f_variant() {} -// CHECK: #pragma omp declare variant(static_f_variant) match(implementation={vendor(score(0):unknown)}) -// CHECK-NEXT: #pragma omp declare variant(static_f_variant) 
match(implementation={vendor(score(0):llvm)},device={kind(fpga)}) +// CHECK: #pragma omp declare variant(static_f_variant) match(implementation={vendor(unknown)}) +// CHECK-NEXT: #pragma omp declare variant(static_f_variant) match(implementation={vendor(llvm)}, device={kind(fpga)}) // CHECK-NEXT: static void static_f() { // CHECK-NEXT: } #pragma omp declare variant(static_f_variant) match(xxx = {}) -#pragma omp declare variant(static_f_variant) match(implementation={vendor(llvm)},device={kind(fpga)}) +#pragma omp declare variant(static_f_variant) match(implementation = {vendor(llvm)}, device = {kind(fpga)}) #pragma omp declare variant(static_f_variant) match(implementation={vendor(unknown)}) static void static_f() {} @@ -192,19 +196,19 @@ void bazzzz() { // CHECK: int fn_linkage_variant(); // CHECK: extern "C" { -// CHECK: #pragma omp declare variant(fn_linkage_variant) match(implementation={vendor(score(0):xxx)},device={kind(cpu, host)}) +// CHECK: #pragma omp declare variant(fn_linkage_variant) match(implementation={vendor(ti)}, device={kind(cpu, host)}) // CHECK: int fn_linkage(); // CHECK: } int fn_linkage_variant(); extern "C" { -#pragma omp declare variant(fn_linkage_variant) match(implementation = {vendor(xxx)},device={kind(cpu,host)}) +#pragma omp declare variant(fn_linkage_variant) match(implementation = {vendor(ti)}, device = {kind(cpu, host)}) int fn_linkage(); } // CHECK: extern "C" int fn_linkage_variant1() -// CHECK: #pragma omp declare variant(fn_linkage_variant1) match(implementation={vendor(score(0):xxx)},device={kind(cpu, host)}) +// CHECK: #pragma omp declare variant(fn_linkage_variant1) match(implementation={vendor(gnu)}, device={kind(cpu, host)}) // CHECK: int fn_linkage1(); extern "C" int fn_linkage_variant1(); -#pragma omp declare variant(fn_linkage_variant1) match(implementation = {vendor(xxx)},device={kind(cpu,host)}) +#pragma omp declare variant(fn_linkage_variant1) match(implementation = {vendor(gnu)}, device = {kind(cpu, host)}) int 
fn_linkage1(); diff --git a/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp b/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp index 225990d62fc3b..55195ffd43b26 100644 --- a/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp +++ b/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp @@ -71,18 +71,18 @@ #pragma omp declare target #ifdef HOST -#define CORRECT host -#define SUBSET host, cpu +#define SUBSET host +#define CORRECT host, cpu #define WRONG host, nohost #endif // HOST #ifdef CPU -#define CORRECT cpu -#define SUBSET host, cpu +#define SUBSET cpu +#define CORRECT cpu, any #define WRONG cpu, gpu #endif // CPU #ifdef NOHOST -#define CORRECT nohost -#define SUBSET nohost, cpu +#define SUBSET nohost +#define CORRECT nohost, cpu #define WRONG nohost, host #endif // NOHOST diff --git a/clang/test/OpenMP/declare_variant_messages.c b/clang/test/OpenMP/declare_variant_messages.c index 26507629ea370..7b87e696152bd 100644 --- a/clang/test/OpenMP/declare_variant_messages.c +++ b/clang/test/OpenMP/declare_variant_messages.c @@ -2,95 +2,102 @@ // RUN: %clang_cc1 -triple=x86_64-pc-win32 -verify -fopenmp-simd -x c -std=c99 -fms-extensions -Wno-pragma-pack %s -// expected-error@+1 {{expected an OpenMP directive}} -#pragma omp declare + +#pragma omp declare // expected-error {{expected an OpenMP directive}} int foo(void); #pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}} -#pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} +#pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} #pragma omp declare variant(foo // expected-error {{expected ')'}} expected-error {{expected 'match' clause on 'omp declare variant' directive}} expected-note {{to match this '('}} #pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} 
#pragma omp declare variant(foo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} #pragma omp declare variant(foo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} #pragma omp declare variant(foo) xxx // expected-error {{expected 'match' clause on 'omp declare variant' directive}} #pragma omp declare variant(foo) match // expected-error {{expected '(' after 'match'}} -#pragma omp declare variant(foo) match( // expected-error {{expected context selector in 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foo) match() // expected-error {{expected context selector in 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foo) match(xxx) // expected-error {{expected '=' after 'xxx' context selector set name on 'omp declare variant' directive}} -#pragma omp declare variant(foo) match(xxx=) // expected-error {{expected '{' after '='}} -#pragma omp declare variant(foo) match(xxx=yyy) // expected-error {{expected '{' after '='}} -#pragma omp declare variant(foo) match(xxx=yyy}) // expected-error {{expected '{' after '='}} -#pragma omp declare variant(foo) match(xxx={) // expected-error {{expected '}' or ',' after ')'}} expected-error {{expected '}'}} expected-note {{to match this '{'}} -#pragma omp declare variant(foo) match(xxx={}) -#pragma omp declare variant(foo) match(xxx={vvv, vvv}) -#pragma omp declare variant(foo) match(xxx={vvv} xxx) // expected-error {{expected ','}} expected-error {{expected '=' after 'xxx' context selector set name on 'omp declare variant' directive}} expected-error {{context selector set 'xxx' is used already in the same 'omp declare variant' directive}} expected-note {{previously context selector set 'xxx' used here}} -#pragma omp declare variant(foo) match(xxx={vvv}) xxx // expected-warning {{extra tokens at the end of '#pragma omp declare variant' are ignored}} -#pragma omp declare variant(foo) match(implementation={xxx}) 
// expected-warning {{unknown context selector in 'implementation' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foo) match(implementation={vendor}) // expected-error {{expected '(' after 'vendor'}} expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-error {{expected ')' or ',' after 'vendor name'}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foo) match(implementation={vendor(}) // expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-error {{expected ')' or ',' after 'vendor name'}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foo) match(implementation={vendor()}) // expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} -#pragma omp declare variant(foo) match(implementation={vendor(score ibm)}) // expected-error {{expected '(' after 'score'}} expected-warning {{missing ':' after context selector score clause - ignoring}} -#pragma omp declare variant(foo) match(implementation={vendor(score( ibm)}) // expected-error {{expected ')' or ',' after 'vendor name'}} expected-error {{expected ')'}} expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-warning {{missing ':' after context selector score clause - ignoring}} expected-note {{to match this '('}} -#pragma omp declare variant(foo) match(implementation={vendor(score(2 ibm)}) // expected-error {{expected ')' or ',' after 'vendor name'}} expected-error 2 {{expected ')'}} expected-error {{expected vendor identifier in 'vendor' context 
selector of 'implementation' selector set of 'omp declare variant' directive}} expected-warning {{missing ':' after context selector score clause - ignoring}} expected-note 2 {{to match this '('}} -#pragma omp declare variant(foo) match(implementation={vendor(score(foo()) ibm)}) // expected-warning {{missing ':' after context selector score clause - ignoring}} expected-error {{expression is not an integer constant expression}} -#pragma omp declare variant(foo) match(implementation={vendor(score(5): ibm), vendor(llvm)}) // expected-error {{context trait selector 'vendor' is used already in the same 'implementation' context selector set of 'omp declare variant' directive}} expected-note {{previously context trait selector 'vendor' used here}} -#pragma omp declare variant(foo) match(implementation={vendor(score(5): ibm), kind(cpu)}) // expected-warning {{unknown context selector in 'implementation' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foo) match(device={xxx}) // expected-warning {{unknown context selector in 'device' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foo) match(device={kind}) // expected-error {{expected '(' after 'kind'}} expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare variant' directive}} expected-error {{expected ')'}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foo) match(device={kind(}) // expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare variant' directive}} expected-error 2 {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foo) match(device={kind()}) // expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare 
variant' directive}} -#pragma omp declare variant(foo) match(device={kind(score cpu)}) // expected-error {{expected ')' or ',' after 'score'}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foo) match(device={kind(score( ibm)}) // expected-error 2 {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foo) match(device={kind(score(2 gpu)}) // expected-error 2 {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foo) match(device={kind(score(foo()) ibm)}) // expected-error {{expected ')' or ',' after 'score'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foo) match(device={kind(score(5): host), kind(llvm)}) // expected-error {{context trait selector 'kind' is used already in the same 'device' context selector set of 'omp declare variant' directive}} expected-note {{previously context trait selector 'kind' used here}} expected-error {{expected ')' or ',' after 'score'}} expected-note {{to match this '('}} expected-error {{expected ')'}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} expected-error {{unknown 'llvm' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foo) match(device={kind(score(5): nohost), 
vendor(llvm)}) // expected-warning {{unknown context selector in 'device' context selector set of 'omp declare variant' directive, ignored}} expected-error {{expected ')' or ',' after 'score'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}}} +#pragma omp declare variant(foo) match( // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context set; set skipped}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match() // expected-warning {{expected identifier or string literal describing a context set; set skipped}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(xxx) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(xxx=) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(xxx=yyy) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(xxx=yyy}) // expected-error {{expected ')'}} expected-warning {{'xxx' is not a 
valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(xxx={) // expected-error {{expected ')'}} expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(xxx={vvv, vvv}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(xxx={vvv} xxx) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(xxx={vvv}) xxx // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foo) match(implementation={xxx}) // expected-warning {{'xxx' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 
'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foo) match(implementation={vendor}) // expected-warning {{the context selector 'vendor' in context set 'implementation' requires a context property defined in parentheses; selector ignored}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foo) match(implementation={vendor(}) // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(implementation={vendor()}) // expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} +#pragma omp declare variant(foo) match(implementation={vendor(score ibm)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} +#pragma omp declare variant(foo) match(implementation={vendor(score( ibm)}) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(implementation={vendor(score(2 ibm)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning 
{{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(implementation={vendor(score(foo()) ibm)}) // expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{score expressions in the OpenMP context selector need to be constant; foo() is not and will be ignored}} +#pragma omp declare variant(foo) match(implementation={vendor(score(5): ibm), vendor(llvm)}) // expected-warning {{the context selector 'vendor' was used already in the same 'omp declare variant' directive; selector ignored}} expected-note {{the previous context selector 'vendor' used here}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foo) match(implementation={vendor(score(5): ibm), kind(cpu)}) // expected-warning {{the context selector 'kind' is not valid for the context set 'implementation'; selector ignored}} expected-note {{the context selector 'kind' can be nested in the context set 'device'; try 'match(device={kind(property)})'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foo) match(device={xxx}) // expected-warning {{'xxx' is not a valid context selector for the context set 'device'; selector ignored}} expected-note {{context selector options are: 'kind' 'isa' 'arch'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foo) match(device={kind}) // expected-warning {{the context selector 'kind' in context set 'device' requires a context property defined in parentheses; selector ignored}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foo) 
match(device={kind(}) // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(device={kind()}) // expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} +#pragma omp declare variant(foo) match(device={kind(score cpu)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} +#pragma omp declare variant(foo) match(device={kind(score( ibm)}) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(device={kind(score(2 gpu)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('2'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foo) match(device={kind(score(foo()) ibm)}) // expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 
'kind' in the context set 'device' cannot have a score ('foo()'); score ignored}} expected-warning {{'ibm' is not a valid context property for the context selector 'kind' and the context set 'device'; property ignored}} expected-note {{try 'match(implementation={vendor(ibm)})'}} expected-note {{the ignored property spans until here}} +#pragma omp declare variant(foo) match(device={kind(score(5): host), kind(llvm)}) // expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('5'); score ignored}} expected-warning {{the context selector 'kind' was used already in the same 'omp declare variant' directive; selector ignored}} expected-note {{the previous context selector 'kind' used here}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foo) match(device={kind(score(5): nohost), vendor(llvm)}) // expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('5'); score ignored}} expected-warning {{the context selector 'vendor' is not valid for the context set 'device'; selector ignored}} expected-note {{the context selector 'vendor' can be nested in the context set 'implementation'; try 'match(implementation={vendor(property)})'}} expected-note {{the ignored selector spans until here}} int bar(void); -// expected-error@+2 {{'#pragma omp declare variant' can only be applied to functions}} -#pragma omp declare variant(foo) match(xxx={}) -int a; -// expected-error@+2 {{'#pragma omp declare variant' can only be applied to functions}} -#pragma omp declare variant(foo) match(xxx={}) -#pragma omp threadprivate(a) +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo) :llvm)}) // expected-warning {{score expressions in the OpenMP context selector need to be constant; foo is not and will be ignored}} +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo()) :llvm)}) // expected-warning {{score expressions in the OpenMP context 
selector need to be constant; foo() is not and will be ignored}} +#pragma omp declare variant(foo) match(implementation = {vendor(score() :llvm)}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} +#pragma omp declare variant(foo) match(user = {condition(foo)}) // expected-error {{the user condition in the OpenMP context selector needs to be constant; foo is not}} +#pragma omp declare variant(foo) match(user = {condition(foo())}) // expected-error {{the user condition in the OpenMP context selector needs to be constant; foo() is not}} +#pragma omp declare variant(foo) match(user = {condition()}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} expected-note {{the ignored selector spans until here}} +int score_and_cond_non_const(); + +#pragma omp declare variant(foo) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +int a; // expected-error {{'#pragma omp declare variant' can only be applied to functions}} + +#pragma omp declare variant(foo) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp threadprivate(a) // expected-error {{'#pragma omp declare variant' can only be applied to functions}} int var; #pragma omp threadprivate(var) -// expected-error@+2 {{expected an OpenMP directive}} expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(foo) match(xxx={}) -#pragma omp declare -// expected-error@+3 {{function declaration is expected 
after 'declare variant' directive}} -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(foo) match(xxx={}) -#pragma omp declare variant(foo) match(xxx={}) +#pragma omp declare variant(foo) match(xxx={}) // expected-error {{function declaration is expected after 'declare variant' directive}} +#pragma omp declare // expected-error {{expected an OpenMP directive}} + + + +#pragma omp declare variant(foo) match(xxx={}) // expected-error {{function declaration is expected after 'declare variant' directive}} +#pragma omp declare variant(foo) match(xxx={}) // expected-error {{function declaration is expected after 'declare variant' directive}} #pragma options align=packed int main(); -// expected-error@+3 {{function declaration is expected after 'declare variant' directive}} -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(foo) match(xxx={}) -#pragma omp declare variant(foo) match(xxx={}) + + +#pragma omp declare variant(foo) match(xxx={}) // expected-error {{function declaration is expected after 'declare variant' directive}} +#pragma omp declare variant(foo) match(xxx={}) // expected-error {{function declaration is expected after 'declare variant' directive}} #pragma init_seg(compiler) int main(); -// expected-error@+1 {{single declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(foo) match(xxx={}) + +#pragma omp declare variant(foo) match(xxx={}) // expected-error {{single declaration is expected after 'declare variant' directive}} expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} int b, c; int no_proto(); -#pragma omp declare variant(no_proto) match(xxx={}) +#pragma omp declare variant(no_proto) match(xxx={}) // 
expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} int no_proto_too(); int proto1(int); -// expected-note@+2 {{previous declaration is here}} -#pragma omp declare variant(proto1) match(xxx={}) -int diff_proto(); -// expected-error@+1 {{conflicting types for 'diff_proto'}} -int diff_proto(double); -#pragma omp declare variant(no_proto) match(xxx={}) +#pragma omp declare variant(proto1) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +int diff_proto(); // expected-note {{previous declaration is here}} + +int diff_proto(double); // expected-error {{conflicting types for 'diff_proto'}} + +#pragma omp declare variant(no_proto) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} int diff_proto1(double); int after_use_variant(void); @@ -99,37 +106,37 @@ int bar() { return after_use(); } -// expected-warning@+1 {{'#pragma omp declare variant' cannot be applied for function after first usage; the original function might be used}} -#pragma omp declare variant(after_use_variant) match(xxx={}) + +#pragma omp declare variant(after_use_variant) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-warning {{'#pragma omp declare variant' cannot be applied for function after first usage; the original function might be used}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} 
int after_use(void); -#pragma omp declare variant(after_use_variant) match(xxx={}) +#pragma omp declare variant(after_use_variant) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} int defined(void) { return 0; } int defined1(void) { return 0; } -// expected-warning@+1 {{#pragma omp declare variant' cannot be applied to the function that was defined already; the original function might be used}} -#pragma omp declare variant(after_use_variant) match(xxx={}) + +#pragma omp declare variant(after_use_variant) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-warning {{'#pragma omp declare variant' cannot be applied to the function that was defined already; the original function might be used}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} int defined1(void); int diff_cc_variant(void); -// expected-error@+1 {{variant in '#pragma omp declare variant' with type 'int (void)' is incompatible with type 'int (void) __attribute__((vectorcall))'}} -#pragma omp declare variant(diff_cc_variant) match(xxx={}) + +#pragma omp declare variant(diff_cc_variant) match(xxx={}) // expected-error {{variant in '#pragma omp declare variant' with type 'int (void)' is incompatible with type 'int (void) __attribute__((vectorcall))'}} expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} __vectorcall int diff_cc(void); int diff_ret_variant(void); -// expected-error@+1 {{variant in '#pragma omp declare variant' with type 'int (void)' is incompatible with type 'void (void)'}} 
-#pragma omp declare variant(diff_ret_variant) match(xxx={}) + +#pragma omp declare variant(diff_ret_variant) match(xxx={}) // expected-error {{variant in '#pragma omp declare variant' with type 'int (void)' is incompatible with type 'void (void)'}} expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} void diff_ret(void); void marked(void); void not_marked(void); -// expected-note@+1 {{marked as 'declare variant' here}} -#pragma omp declare variant(not_marked) match(implementation={vendor(unknown)}, device={kind(cpu)}) + +#pragma omp declare variant(not_marked) match(implementation={vendor(unknown)}, device={kind(cpu)}) // expected-note {{marked as 'declare variant' here}} void marked_variant(void); -// expected-warning@+1 {{variant function in '#pragma omp declare variant' is itself marked as '#pragma omp declare variant'}} -#pragma omp declare variant(marked_variant) match(xxx={}) + +#pragma omp declare variant(marked_variant) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-warning {{variant function in '#pragma omp declare variant' is itself marked as '#pragma omp declare variant'}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} void marked(void); -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant + +#pragma omp declare variant // expected-error {{function declaration is expected after 'declare variant' directive}} + +#pragma omp declare variant // expected-error {{function declaration is expected after 'declare variant' directive}} diff --git 
a/clang/test/OpenMP/declare_variant_messages.cpp b/clang/test/OpenMP/declare_variant_messages.cpp index ca1e4c33d17ee..f9950a88241c4 100644 --- a/clang/test/OpenMP/declare_variant_messages.cpp +++ b/clang/test/OpenMP/declare_variant_messages.cpp @@ -2,137 +2,180 @@ // RUN: %clang_cc1 -triple=x86_64-pc-win32 -verify -fopenmp-simd -x c++ -std=c++14 -fms-extensions -Wno-pragma-pack -fexceptions -fcxx-exceptions %s -// expected-error@+1 {{expected an OpenMP directive}} -#pragma omp declare + +#pragma omp declare // expected-error {{expected an OpenMP directive}} int foo(); template -T foofoo(); // expected-note 2 {{declared here}} - -#pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}} -#pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foo // expected-error {{expected ')'}} expected-error {{expected 'match' clause on 'omp declare variant' directive}} expected-note {{to match this '('}} -#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} -#pragma omp declare variant(foo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) xxx // expected-error {{expected 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match // expected-error {{expected '(' after 'match'}} -#pragma omp declare variant(foofoo ) match( // expected-error {{expected context selector in 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match() // expected-error {{expected context selector in 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match(xxx) // expected-error {{expected '=' after 'xxx' context selector set name on 
'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match(xxx =) // expected-error {{expected '{' after '='}} -#pragma omp declare variant(foofoo ) match(xxx = yyy) // expected-error {{expected '{' after '='}} -#pragma omp declare variant(foofoo ) match(xxx = yyy }) // expected-error {{expected '{' after '='}} -#pragma omp declare variant(foofoo ) match(xxx = {) // expected-error {{expected '}' or ',' after ')'}} expected-error {{expected '}'}} expected-note {{to match this '{'}} -#pragma omp declare variant(foofoo ) match(xxx = {}) -#pragma omp declare variant(foofoo ) match(xxx = {vvv, vvv}) -#pragma omp declare variant(foofoo ) match(xxx = {vvv} xxx) // expected-error {{expected ','}} expected-error {{expected '=' after 'xxx' context selector set name on 'omp declare variant' directive}} expected-error {{context selector set 'xxx' is used already in the same 'omp declare variant' directive}} expected-note {{previously context selector set 'xxx' used here}} -#pragma omp declare variant(foofoo ) match(xxx = {vvv}) xxx // expected-warning {{extra tokens at the end of '#pragma omp declare variant' are ignored}} -#pragma omp declare variant(foofoo ) match(implementation={xxx}) // expected-warning {{unknown context selector in 'implementation' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foofoo ) match(implementation={vendor}) // expected-error {{expected '(' after 'vendor'}} expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-error {{expected ')' or ',' after 'vendor name'}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(}) // expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-error {{expected ')' or ',' after 
'vendor name'}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(implementation={vendor()}) // expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score ibm)}) // expected-error {{expected '(' after 'score'}} expected-warning {{missing ':' after context selector score clause - ignoring}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score( ibm)}) // expected-error {{expected ')' or ',' after 'vendor name'}} expected-error {{expected ')'}} expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-warning {{missing ':' after context selector score clause - ignoring}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(2 ibm)}) // expected-error {{expected ')' or ',' after 'vendor name'}} expected-error 2 {{expected ')'}} expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-warning {{missing ':' after context selector score clause - ignoring}} expected-note 2 {{to match this '('}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(foofoo ()) ibm)}) // expected-warning {{missing ':' after context selector score clause - ignoring}} expected-error {{expression is not an integral constant expression}} expected-note {{non-constexpr function 'foofoo' cannot be used in a constant expression}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(5): ibm), vendor(llvm)}) // expected-error {{context trait selector 'vendor' is used already in the same 'implementation' context selector set of 'omp 
declare variant' directive}} expected-note {{previously context trait selector 'vendor' used here}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(5): ibm), kind(cpu)}) // expected-warning {{unknown context selector in 'implementation' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foofoo ) match(device={xxx}) // expected-warning {{unknown context selector in 'device' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foofoo ) match(device={kind}) // expected-error {{expected '(' after 'kind'}} expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare variant' directive}} expected-error {{expected ')'}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(device={kind(}) // expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare variant' directive}} expected-error 2 {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(device={kind()}) // expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match(device={kind(score cpu)}) // expected-error {{expected ')' or ',' after 'score'}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score( ibm)}) // expected-error 2 {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) 
match(device={kind(score(2 gpu)}) // expected-error 2 {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score(foofoo ()) ibm)}) // expected-error {{expected ')' or ',' after 'score'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score(5): host), kind(llvm)}) // expected-error {{context trait selector 'kind' is used already in the same 'device' context selector set of 'omp declare variant' directive}} expected-note {{previously context trait selector 'kind' used here}} expected-error {{expected ')' or ',' after 'score'}} expected-note {{to match this '('}} expected-error {{expected ')'}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} expected-error {{unknown 'llvm' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score(5): nohost), vendor(llvm)}) // expected-warning {{unknown context selector in 'device' context selector set of 'omp declare variant' directive, ignored}} expected-error {{expected ')' or ',' after 'score'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} +T foofoo(); + +#pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}} +#pragma omp declare variant( // expected-error {{expected expression}} 
expected-error {{expected ')'}} expected-note {{to match this '('}} +#pragma omp declare variant(foo // expected-error {{expected ')'}} expected-error {{expected 'match' clause on 'omp declare variant' directive}} expected-note {{to match this '('}} +#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} +#pragma omp declare variant(foo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} +#pragma omp declare variant(foofoo ) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} +#pragma omp declare variant(foofoo ) xxx // expected-error {{expected 'match' clause on 'omp declare variant' directive}} +#pragma omp declare variant(foofoo ) match // expected-error {{expected '(' after 'match'}} +#pragma omp declare variant(foofoo ) match( // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context set; set skipped}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match() // expected-warning {{expected identifier or string literal describing a context set; set skipped}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foofoo ) match(implementation) // expected-warning {{expected '=' after the context set name "implementation"; '=' assumed}} expected-warning {{expected '{' after the '=' that follows the context set name "implementation"; '{' assumed}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{expected '}' after the context selectors for the context set "implementation"; '}' assumed}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 
'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation =) // expected-warning {{expected '{' after the '=' that follows the context set name "implementation"; '{' assumed}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{expected '}' after the context selectors for the context set "implementation"; '}' assumed}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = yyy) // expected-warning {{expected '{' after the '=' that follows the context set name "implementation"; '{' assumed}} expected-warning {{'yyy' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-warning {{expected '}' after the context selectors for the context set "implementation"; '}' assumed}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = yyy }) // expected-warning {{expected '{' after the '=' that follows the context set name "implementation"; '{' assumed}} expected-warning {{'yyy' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) 
match(implementation = {) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{expected '}' after the context selectors for the context set "implementation"; '}' assumed}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = {vvv, vvv}) // expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = {vvv} implementation) // expected-error {{expected ')'}} expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 
'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(implementation = {vvv}) implementation // expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation={xxx}) // expected-warning {{'xxx' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation={vendor}) // expected-warning {{the context selector 'vendor' in context set 'implementation' requires a context property defined in parentheses; selector ignored}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(}) // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(implementation={vendor()}) // expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} +#pragma omp 
declare variant(foofoo ) match(implementation={vendor(score ibm)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score( ibm)}) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(2 ibm)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(foofoo ()) ibm)}) // expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{score expressions in the OpenMP context selector need to be constant; foofoo() is not and will be ignored}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(5): ibm), vendor(llvm)}) // expected-warning {{the context selector 'vendor' was used already in the same 'omp declare variant' directive; selector ignored}} expected-note {{the previous context selector 'vendor' used here}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(5): ibm), 
kind(cpu)}) // expected-warning {{the context selector 'kind' is not valid for the context set 'implementation'; selector ignored}} expected-note {{the context selector 'kind' can be nested in the context set 'device'; try 'match(device={kind(property)})'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={xxx}) // expected-warning {{'xxx' is not a valid context selector for the context set 'device'; selector ignored}} expected-note {{context selector options are: 'kind' 'isa' 'arch'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind}) // expected-warning {{the context selector 'kind' in context set 'device' requires a context property defined in parentheses; selector ignored}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind(}) // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(device={kind()}) // expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} +#pragma omp declare variant(foofoo ) match(device={kind(score cpu)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} +#pragma omp declare variant(foofoo ) match(device={kind(score( ibm)}) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} 
expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(device={kind(score(2 gpu)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('2'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(device={kind(score(foofoo ()) ibm)}) // expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('foofoo()'); score ignored}} expected-warning {{'ibm' is not a valid context property for the context selector 'kind' and the context set 'device'; property ignored}} expected-note {{try 'match(implementation={vendor(ibm)})'}} expected-note {{the ignored property spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind(score(5): host), kind(llvm)}) // expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('5'); score ignored}} expected-warning {{the context selector 'kind' was used already in the same 'omp declare variant' directive; selector ignored}} expected-note {{the previous context selector 'kind' used here}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind(score(5): nohost), vendor(llvm)}) // expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('5'); score ignored}} expected-warning {{the context selector 'vendor' is not valid 
for the context set 'device'; selector ignored}} expected-note {{the context selector 'vendor' can be nested in the context set 'implementation'; try 'match(implementation={vendor(property)})'}} expected-note {{the ignored selector spans until here}} int bar(); -#pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}} -#pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo // expected-error {{expected ')'}} expected-error {{expected 'match' clause on 'omp declare variant' directive}} expected-note {{to match this '('}} -#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} -#pragma omp declare variant(foo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) xxx // expected-error {{expected 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match // expected-error {{expected '(' after 'match'}} -#pragma omp declare variant(foofoo ) match( // expected-error {{expected context selector in 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match() // expected-error {{expected context selector in 'match' clause on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match(xxx) // expected-error {{expected '=' after 'xxx' context selector set name on 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match(xxx =) // expected-error {{expected '{' after '='}} -#pragma omp declare variant(foofoo ) match(xxx = {) // expected-error {{expected '}' or ',' after ')'}} expected-error 
{{expected '}'}} expected-note {{to match this '{'}} -#pragma omp declare variant(foofoo ) match(xxx = {}) -#pragma omp declare variant(foofoo ) match(xxx = {vvv, vvv}) -#pragma omp declare variant(foofoo ) match(user = {score() : condition()}) -#pragma omp declare variant(foofoo ) match(user = {score() : condition()}) -#pragma omp declare variant(foofoo ) match(user = {condition()}) -#pragma omp declare variant(foofoo ) match(user = {condition()}) -#pragma omp declare variant(foofoo ) match(xxx = {vvv} xxx) // expected-error {{expected ','}} expected-error {{expected '=' after 'xxx' context selector set name on 'omp declare variant' directive}} expected-error {{context selector set 'xxx' is used already in the same 'omp declare variant' directive}} expected-note {{previously context selector set 'xxx' used here}} -#pragma omp declare variant(foofoo ) match(xxx = {vvv}) xxx // expected-warning {{extra tokens at the end of '#pragma omp declare variant' are ignored}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score ibm)}) // expected-error {{expected '(' after 'score'}} expected-warning {{missing ':' after context selector score clause - ignoring}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score( ibm)}) // expected-error {{expected ')' or ',' after 'vendor name'}} expected-error {{expected ')'}} expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} expected-warning {{missing ':' after context selector score clause - ignoring}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(C ibm)}) // expected-error {{expected ')' or ',' after 'vendor name'}} expected-error 2 {{expected ')'}} expected-error {{expected vendor identifier in 'vendor' context selector of 'implementation' selector set of 'omp declare variant' directive}} 
expected-warning {{missing ':' after context selector score clause - ignoring}} expected-note 2 {{to match this '('}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(foofoo ()) ibm)}) // expected-warning {{missing ':' after context selector score clause - ignoring}} expected-error {{expression is not an integral constant expression}} expected-note {{non-constexpr function 'foofoo' cannot be used in a constant expression}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(C+5): ibm), vendor(llvm)}) // expected-error {{context trait selector 'vendor' is used already in the same 'implementation' context selector set of 'omp declare variant' directive}} expected-note {{previously context trait selector 'vendor' used here}} -#pragma omp declare variant(foofoo ) match(implementation={vendor(score(5): ibm), kind(cpu)}) // expected-warning {{unknown context selector in 'implementation' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foofoo ) match(device={xxx}) // expected-warning {{unknown context selector in 'device' context selector set of 'omp declare variant' directive, ignored}} -#pragma omp declare variant(foofoo ) match(device={kind}) // expected-error {{expected '(' after 'kind'}} expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare variant' directive}} expected-error {{expected ')'}} expected-error {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(device={kind(}) // expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context selector of 'device' selector set of 'omp declare variant' directive}} expected-error 2 {{expected ')'}} expected-note {{to match this '('}} -#pragma omp declare variant(foofoo ) match(device={kind()}) // expected-error {{expected 'host', 'nohost', 'cpu', 'gpu', or 'fpga' in 'kind' context 
selector of 'device' selector set of 'omp declare variant' directive}} -#pragma omp declare variant(foofoo ) match(device={kind(score cpu)}) // expected-error {{expected ')' or ',' after 'score'}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score( ibm)}) // expected-error 2 {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score(C gpu)}) // expected-error 2 {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score(foofoo ()) ibm)}) // expected-error {{expected ')' or ',' after 'score'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score(C+5): host), kind(llvm)}) // expected-error {{context trait selector 'kind' is used already in the same 'device' context selector set of 'omp declare variant' directive}} expected-note {{previously context trait selector 'kind' used here}} expected-error {{expected ')' or ',' after 'score'}} expected-note {{to match this '('}} expected-error {{expected ')'}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} expected-error {{unknown 'llvm' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 
'fpga'}} -#pragma omp declare variant(foofoo ) match(device={kind(score(C+5): nohost), vendor(llvm)}) // expected-warning {{unknown context selector in 'device' context selector set of 'omp declare variant' directive, ignored}} expected-error {{expected ')' or ',' after 'score'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{unknown 'score' device kind trait in the 'device' context selector set, expected one of 'host', 'nohost', 'cpu', 'gpu' or 'fpga'}} +#pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}} +#pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo // expected-error {{expected ')'}} expected-error {{expected 'match' clause on 'omp declare variant' directive}} expected-note {{to match this '('}} +#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} +#pragma omp declare variant(foo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} +#pragma omp declare variant(foofoo) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} +#pragma omp declare variant(foofoo ) // expected-error {{expected 'match' clause on 'omp declare variant' directive}} +#pragma omp declare variant(foofoo ) xxx // expected-error {{expected 'match' clause on 'omp declare variant' directive}} +#pragma omp declare variant(foofoo ) match // expected-error {{expected '(' after 'match'}} +#pragma omp declare variant(foofoo ) match( // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context set; set skipped}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match() // expected-warning {{expected 
identifier or string literal describing a context set; set skipped}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +#pragma omp declare variant(foofoo ) match(implementation) // expected-warning {{expected '=' after the context set name "implementation"; '=' assumed}} expected-warning {{expected '{' after the '=' that follows the context set name "implementation"; '{' assumed}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{expected '}' after the context selectors for the context set "implementation"; '}' assumed}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation =) // expected-warning {{expected '{' after the '=' that follows the context set name "implementation"; '{' assumed}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{expected '}' after the context selectors for the context set "implementation"; '}' assumed}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = {) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{expected '}' after the context selectors for the context set "implementation"; '}' assumed}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 
'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = {vvv, vvv}) // expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(user = {score() : condition()}) // expected-warning {{'score' is not a valid context selector for the context set 'user'; selector ignored}} expected-note {{context selector options are: 'condition'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(user = {score() : condition()}) // expected-warning {{'score' is not a valid context selector for the context set 'user'; selector ignored}} expected-note {{context selector options are: 'condition'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(user = {condition()}) // expected-error {{expected expression}} expected-error {{use of 
undeclared identifier 'expr'}} expected-error {{expected expression}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(user = {condition()}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation = {vvv} implementation) // expected-error {{expected ')'}} expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(implementation = {vvv}) xxx // expected-warning {{'vvv' is not a valid context selector for the context set 'implementation'; selector ignored}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score ibm)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score( ibm)}) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} 
expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(C ibm)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'amd' 'arm' 'bsc' 'cray' 'fujitsu' 'gnu' 'ibm' 'intel' 'llvm' 'pgi' 'ti' 'unknown'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(foofoo ()) ibm)}) // expected-warning {{expected '':'' after the score expression; '':'' assumed}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(C+5): ibm), vendor(llvm)}) // expected-warning {{the context selector 'vendor' was used already in the same 'omp declare variant' directive; selector ignored}} expected-note {{the previous context selector 'vendor' used here}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(implementation={vendor(score(5): ibm), kind(cpu)}) // expected-warning {{the context selector 'kind' is not valid for the context set 'implementation'; selector ignored}} expected-note {{the context selector 'kind' can be nested in the context set 'device'; try 'match(device={kind(property)})'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={xxx}) // expected-warning {{'xxx' is not a valid context selector for the context set 'device'; selector ignored}} expected-note {{context selector options are: 'kind' 'isa' 'arch'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind}) // expected-warning {{the context selector 'kind' in context set 'device' requires a context property defined in parentheses; selector ignored}} 
expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind(}) // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(device={kind()}) // expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} +#pragma omp declare variant(foofoo ) match(device={kind(score cpu)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} +#pragma omp declare variant(foofoo ) match(device={kind(score( ibm)}) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(device={kind(score(C gpu)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('C'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}} +#pragma omp declare variant(foofoo ) match(device={kind(score(foofoo ()) ibm)}) // 
expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('foofoo()'); score ignored}} expected-warning {{'ibm' is not a valid context property for the context selector 'kind' and the context set 'device'; property ignored}} expected-note {{try 'match(implementation={vendor(ibm)})'}} expected-note {{the ignored property spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind(score(C+5): host), kind(llvm)}) // expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('C + 5'); score ignored}} expected-warning {{the context selector 'kind' was used already in the same 'omp declare variant' directive; selector ignored}} expected-note {{the previous context selector 'kind' used here}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(foofoo ) match(device={kind(score(C+5): nohost), vendor(llvm)}) // expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('C + 5'); score ignored}} expected-warning {{the context selector 'vendor' is not valid for the context set 'device'; selector ignored}} expected-note {{the context selector 'vendor' can be nested in the context set 'implementation'; try 'match(implementation={vendor(property)})'}} expected-note {{the ignored selector spans until here}} template T barbar(); -// expected-error@+2 {{'#pragma omp declare variant' can only be applied to functions}} -#pragma omp declare variant(barbar ) match(xxx = {}) -int a; -// expected-error@+2 {{'#pragma omp declare variant' can only be applied to functions}} -#pragma omp declare variant(barbar ) match(xxx = {}) -#pragma omp threadprivate(a) +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo) :llvm)}) // expected-warning {{score expressions in the OpenMP context selector need to be constant; foo is not and will be 
ignored}} +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo()) :llvm)}) // expected-warning {{score expressions in the OpenMP context selector need to be constant; foo() is not and will be ignored}} +#pragma omp declare variant(foo) match(implementation = {vendor(score() :llvm)}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} +#pragma omp declare variant(foo) match(user = {condition(foo)}) // expected-error {{the user condition in the OpenMP context selector needs to be constant; foo is not}} +#pragma omp declare variant(foo) match(user = {condition(foo())}) // expected-error {{the user condition in the OpenMP context selector needs to be constant; foo() is not}} +#pragma omp declare variant(foo) match(user = {condition()}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} expected-note {{the ignored selector spans until here}} +int score_and_cond_non_const(); + +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo) :llvm)}) +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo()) :llvm)}) +#pragma omp declare variant(foo) match(implementation = {vendor(score() :llvm)}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} +#pragma omp declare variant(foo) match(user = {condition(foo)}) +#pragma omp declare variant(foo) match(user = {condition(foo())}) +#pragma omp declare variant(foo) match(user = {condition()}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} expected-note {{the ignored selector spans until here}} +template +int score_and_cond_non_const_no_inst(); + +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo) :llvm)}) // 
expected-warning {{score expressions in the OpenMP context selector need to be constant; foo is not and will be ignored}} +#pragma omp declare variant(foo) match(implementation = {vendor(score(foo()) :llvm)}) // expected-warning {{score expressions in the OpenMP context selector need to be constant; foo() is not and will be ignored}} +#pragma omp declare variant(foo) match(implementation = {vendor(score() :llvm)}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'expr'}} expected-error {{expected expression}} +#pragma omp declare variant(foo) match(user = {condition(foo)}) // expected-error {{the user condition in the OpenMP context selector needs to be constant; foo is not}} +#pragma omp declare variant(foo) match(user = {condition(foo())}) // expected-error {{the user condition in the OpenMP context selector needs to be constant; foo() is not}} +#pragma omp declare variant(foo) match(user = {condition()}) // expected-error {{expected expression}} expected-error {{expected expression}} expected-note {{the ignored selector spans until here}} +template +int score_and_cond_non_const_inst(); + +constexpr int constexpr_fn(int i) { return 7 * i; } +#pragma omp declare variant(foo) match(implementation = {vendor(score(constexpr_fn(3)) : llvm)}) +#pragma omp declare variant(foo) match(user = {condition(constexpr_fn(1))}) +int score_and_cond_const(); + +#pragma omp declare variant(foo) match(implementation = {vendor(score(constexpr_fn(3)) : llvm)}) +#pragma omp declare variant(foo) match(implementation = {vendor(score(constexpr_fn(C)) : llvm)}) +#pragma omp declare variant(foo) match(user = {condition(constexpr_fn(1))}) +#pragma omp declare variant(foo) match(user = {condition(constexpr_fn(C))}) +template +int score_and_cond_const_inst(); + +void score_and_cond_inst() { + score_and_cond_non_const(); + score_and_cond_non_const_inst<8>(); // expected-note {{in instantiation of function template specialization 
'score_and_cond_non_const_inst<8>' requested here}} + score_and_cond_const_inst<9>(); +} + +#pragma omp declare variant(barbar ) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +int a; // expected-error {{'#pragma omp declare variant' can only be applied to functions}} + +#pragma omp declare variant(barbar ) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp threadprivate(a) // expected-error {{'#pragma omp declare variant' can only be applied to functions}} int var; #pragma omp threadprivate(var) -// expected-error@+2 {{expected an OpenMP directive}} expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(barbar ) match(xxx = {}) -#pragma omp declare -// expected-error@+3 {{function declaration is expected after 'declare variant' directive}} -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(barbar ) match(xxx = {}) -#pragma omp declare variant(barbar ) match(xxx = {}) +#pragma omp declare variant(barbar ) match(implementation = {}) // expected-error {{function declaration is expected after 'declare variant' directive}} +#pragma omp declare // expected-error {{expected an OpenMP directive}} + + + +#pragma omp declare variant(barbar ) match(implementation = {}) // expected-error {{function 
declaration is expected after 'declare variant' directive}} +#pragma omp declare variant(barbar ) match(xxx = {}) // expected-error {{function declaration is expected after 'declare variant' directive}} #pragma options align = packed int main(); -// expected-error@+3 {{function declaration is expected after 'declare variant' directive}} -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(barbar ) match(xxx = {}) -#pragma omp declare variant(barbar ) match(xxx = {}) + + +#pragma omp declare variant(barbar ) match(implementation = {}) // expected-error {{function declaration is expected after 'declare variant' directive}} +#pragma omp declare variant(barbar ) match(xxx = {}) // expected-error {{function declaration is expected after 'declare variant' directive}} #pragma init_seg(compiler) int main(); -// expected-error@+1 {{single declaration is expected after 'declare variant' directive}} -#pragma omp declare variant(barbar ) match(xxx = {}) + +#pragma omp declare variant(barbar ) match(implementation = {}) // expected-error {{single declaration is expected after 'declare variant' directive}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} int b, c; -// expected-error@+1 {{'C' does not refer to a value}} -#pragma omp declare variant(C) match(xxx = {}) -// expected-note@+1 {{declared here}} -template + +#pragma omp declare variant(C) match(implementation = {}) // expected-error {{'C' does not refer to a value}} + +template // expected-note {{declared here}} void h(C *hp, C *hp2, C *hq, C *lin) { b = 0; } -// expected-error@+1 {{variant in '#pragma omp declare variant' with type '' is incompatible with type 'void 
(int *, int *, int *, int *)'}} -#pragma omp declare variant(barbar ) match(xxx = {}) + +#pragma omp declare variant(barbar ) match(implementation = {}) // expected-error {{variant in '#pragma omp declare variant' with type '' is incompatible with type 'void (int *, int *, int *, int *)'}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} template <> void h(int *hp, int *hp2, int *hq, int *lin); @@ -142,113 +185,113 @@ int bar() { return after_use(); } -// expected-warning@+1 {{'#pragma omp declare variant' cannot be applied for function after first usage; the original function might be used}} -#pragma omp declare variant(after_use_variant) match(xxx = {}) + +#pragma omp declare variant(after_use_variant) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{'#pragma omp declare variant' cannot be applied for function after first usage; the original function might be used}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} int after_use(void); int fn(); int fn(int); -#pragma omp declare variant(fn) match(xxx = {}) +#pragma omp declare variant(fn) match(xxx = {}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} int overload(void); int fn1(); int fn1(int); -// expected-error@+1 {{variant in '#pragma omp declare variant' 
with type '' is incompatible with type 'int (float)'}} -#pragma omp declare variant(fn1) match(xxx = {}) + +#pragma omp declare variant(fn1) match(implementation = {}) // expected-error {{variant in '#pragma omp declare variant' with type '' is incompatible with type 'int (float)'}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} int overload1(float); int fn_constexpr_variant(); -// expected-error@+2 {{'#pragma omp declare variant' does not support constexpr functions}} -#pragma omp declare variant(fn_constexpr_variant) match(xxx = {}) -constexpr int fn_constexpr(); + +#pragma omp declare variant(fn_constexpr_variant) match(xxx = {}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +constexpr int fn_constexpr(); // expected-error {{'#pragma omp declare variant' does not support constexpr functions}} constexpr int fn_constexpr_variant1(); -// expected-error@+1 {{'#pragma omp declare variant' does not support constexpr functions}} -#pragma omp declare variant(fn_constexpr_variant1) match(xxx = {}) + +#pragma omp declare variant(fn_constexpr_variant1) match(implementation = {}) // expected-error {{'#pragma omp declare variant' does not support constexpr functions}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} int 
fn_constexpr1(); int fn_sc_variant(); -// expected-error@+1 {{function with '#pragma omp declare variant' has a different storage class}} -#pragma omp declare variant(fn_sc_variant) match(xxx = {}) + +#pragma omp declare variant(fn_sc_variant) match(xxx = {}) // expected-error {{function with '#pragma omp declare variant' has a different storage class}} expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} static int fn_sc(); static int fn_sc_variant1(); -// expected-error@+1 {{function with '#pragma omp declare variant' has a different storage class}} -#pragma omp declare variant(fn_sc_variant1) match(xxx = {}) + +#pragma omp declare variant(fn_sc_variant1) match(implementation = {}) // expected-error {{function with '#pragma omp declare variant' has a different storage class}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} int fn_sc1(); int fn_inline_variant(); -// expected-error@+1 {{function with '#pragma omp declare variant' has a different inline specification}} -#pragma omp declare variant(fn_inline_variant) match(xxx = {}) + +#pragma omp declare variant(fn_inline_variant) match(xxx = {}) // expected-error {{function with '#pragma omp declare variant' has a different inline specification}} expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} inline int fn_inline(); inline int fn_inline_variant1(); -// expected-error@+1 
{{function with '#pragma omp declare variant' has a different inline specification}} -#pragma omp declare variant(fn_inline_variant1) match(xxx = {}) + +#pragma omp declare variant(fn_inline_variant1) match(implementation = {}) // expected-error {{function with '#pragma omp declare variant' has a different inline specification}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} int fn_inline1(); auto fn_deduced_variant() { return 0; } -#pragma omp declare variant(fn_deduced_variant) match(xxx = {}) +#pragma omp declare variant(fn_deduced_variant) match(xxx = {}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} int fn_deduced(); int fn_deduced_variant1(); -#pragma omp declare variant(fn_deduced_variant1) match(xxx = {}) +#pragma omp declare variant(fn_deduced_variant1) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} auto fn_deduced1() { return 0; } auto fn_deduced3() { return 0; } -// expected-warning@+1 {{'#pragma omp declare variant' cannot be applied to the function that was defined already; the original function might be used}} -#pragma omp declare variant(fn_deduced_variant1) match(xxx = {}) + +#pragma omp declare variant(fn_deduced_variant1) match(implementation = {}) // 
expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-warning {{'#pragma omp declare variant' cannot be applied to the function that was defined already; the original function might be used}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} auto fn_deduced3(); auto fn_deduced_variant2() { return 0; } -// expected-error@+1 {{variant in '#pragma omp declare variant' with type 'int ()' is incompatible with type 'float ()'}} -#pragma omp declare variant(fn_deduced_variant2) match(xxx = {}) + +#pragma omp declare variant(fn_deduced_variant2) match(xxx = {}) // expected-error {{variant in '#pragma omp declare variant' with type 'int ()' is incompatible with type 'float ()'}} expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} float fn_deduced2(); -// expected-error@+1 {{exception specification in declaration does not match previous declaration}} -int fn_except_variant() noexcept(true); -// expected-note@+2 {{previous declaration is here}} -#pragma omp declare variant(fn_except_variant) match(xxx = {}) -int fn_except() noexcept(false); -// expected-error@+1 {{exception specification in declaration does not match previous declaration}} -int fn_except_variant1() noexcept(false); -// expected-note@+2 {{previous declaration is here}} -#pragma omp declare variant(fn_except_variant1) match(xxx = {}) -int fn_except1() noexcept(true); +int fn_except_variant() noexcept(true); // expected-error {{exception specification in declaration does not match previous declaration}} + +#pragma omp declare variant(fn_except_variant) match(implementation = {}) // expected-warning 
{{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +int fn_except() noexcept(false); // expected-note {{previous declaration is here}} + + +int fn_except_variant1() noexcept(false); // expected-error {{exception specification in declaration does not match previous declaration}} + +#pragma omp declare variant(fn_except_variant1) match(xxx = {}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} +int fn_except1() noexcept(true); // expected-note {{previous declaration is here}} struct SpecialFuncs { void vd(); - // expected-error@+2 {{'#pragma omp declare variant' does not support constructors}} -#pragma omp declare variant(SpecialFuncs::vd) match(xxx = {}) - SpecialFuncs(); - // expected-error@+2 {{'#pragma omp declare variant' does not support destructors}} -#pragma omp declare variant(SpecialFuncs::vd) match(xxx = {}) - ~SpecialFuncs(); + +#pragma omp declare variant(SpecialFuncs::vd) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} + SpecialFuncs(); // expected-error {{'#pragma omp declare variant' does not support constructors}} + +#pragma omp declare variant(SpecialFuncs::vd) match(xxx = {}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note 
{{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} + ~SpecialFuncs(); // expected-error {{'#pragma omp declare variant' does not support destructors}} void baz(); void bar(); void bar(int); -#pragma omp declare variant(SpecialFuncs::baz) match(xxx = {}) -#pragma omp declare variant(SpecialFuncs::bar) match(xxx = {}) -// expected-error@+1 {{variant in '#pragma omp declare variant' with type 'int (*)()' is incompatible with type 'void (SpecialFuncs::*)()'}} -#pragma omp declare variant(fn_sc_variant1) match(xxx = {}) +#pragma omp declare variant(SpecialFuncs::baz) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} +#pragma omp declare variant(SpecialFuncs::bar) match(xxx = {}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} + +#pragma omp declare variant(fn_sc_variant1) match(implementation = {}) // expected-error {{variant in '#pragma omp declare variant' with type 'int (*)()' is incompatible with type 'void (SpecialFuncs::*)()'}} expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} void foo1(); SpecialFuncs& foo(const SpecialFuncs&); SpecialFuncs& bar(SpecialFuncs&&); - // expected-error@+2 {{'#pragma omp declare 
variant' does not support defaulted functions}} -#pragma omp declare variant(SpecialFuncs::foo) match(xxx = {}) - SpecialFuncs& operator=(const SpecialFuncs&) = default; - // expected-error@+2 {{'#pragma omp declare variant' does not support deleted functions}} -#pragma omp declare variant(SpecialFuncs::bar) match(xxx = {}) - SpecialFuncs& operator=(SpecialFuncs&&) = delete; + +#pragma omp declare variant(SpecialFuncs::foo) match(xxx = {}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} + SpecialFuncs& operator=(const SpecialFuncs&) = default; // expected-error {{'#pragma omp declare variant' does not support defaulted functions}} + +#pragma omp declare variant(SpecialFuncs::bar) match(implementation = {}) // expected-warning {{expected identifier or string literal describing a context selector; selector skipped}} expected-note {{context selector options are: 'vendor' 'extension' 'unified_address' 'unified_shared_memory' 'reverse_offload' 'dynamic_allocators' 'atomic_default_mem_order'}} expected-note {{the ignored selector spans until here}} + SpecialFuncs& operator=(SpecialFuncs&&) = delete; // expected-error {{'#pragma omp declare variant' does not support deleted functions}} }; namespace N { -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant + +#pragma omp declare variant // expected-error {{function declaration is expected after 'declare variant' directive}} } // namespace N -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant -// expected-error@+1 {{function declaration is expected after 'declare variant' directive}} -#pragma omp declare variant + +#pragma omp declare variant // expected-error {{function declaration is expected after 
'declare variant' directive}} + +#pragma omp declare variant // expected-error {{function declaration is expected after 'declare variant' directive}} diff --git a/clang/test/OpenMP/declare_variant_mixed_codegen.cpp b/clang/test/OpenMP/declare_variant_mixed_codegen.cpp index 0c13f5f2f1205..4609a4f77728f 100644 --- a/clang/test/OpenMP/declare_variant_mixed_codegen.cpp +++ b/clang/test/OpenMP/declare_variant_mixed_codegen.cpp @@ -49,7 +49,7 @@ int call() { return 1; } static int stat_unused_no_emit() { return 1; } static int stat_unused_(); #pragma omp declare variant(stat_unused_) match(implementation = {vendor(llvm)}, device={kind(cpu)}) -#pragma omp declare variant(stat_unused_no_emit) match(implementation = {vendor(xxx)}, device={kind(gpu)}) +#pragma omp declare variant(stat_unused_no_emit) match(implementation = {vendor(unknown)}, device = {kind(gpu)}) static int stat_unused() { return 1; } static int stat_used_(); @@ -103,16 +103,16 @@ void xxx() { int prio() { return 81; } int prio1() { return 82; } -#pragma omp declare variant(prio) match(implementation = {vendor(score(2): llvm)}, device={kind(cpu,host)}) -#pragma omp declare variant(prio1) match(implementation = {vendor(score(1): llvm)}, device={kind(cpu)}) +#pragma omp declare variant(prio1) match(implementation = {vendor(score(2): llvm)}, device={kind(cpu,host)}) +#pragma omp declare variant(prio) match(implementation = {vendor(score(1): llvm)}, device={kind(cpu)}) int prio_() { return 1; } static int prio2() { return 83; } static int prio3() { return 84; } static int prio4() { return 84; } -#pragma omp declare variant(prio4) match(implementation = {vendor(score(8): llvm)},device={kind(cpu,host)}) -#pragma omp declare variant(prio2) match(implementation = {vendor(score(5): llvm)}) +#pragma omp declare variant(prio4) match(implementation = {vendor(score(5): llvm)}) +#pragma omp declare variant(prio2) match(implementation = {vendor(score(8): llvm)}, device={kind(cpu,host)}) #pragma omp declare variant(prio3) 
match(implementation = {vendor(score(7): llvm)}, device={kind(cpu)}) static int prio1_() { return 1; } @@ -137,7 +137,7 @@ int fn_variant2() { return 1; } #pragma omp declare variant(fn_variant2) match(implementation = {vendor(llvm)}, device={kind(fpga)}) int fn2() { return 87; } -#pragma omp declare variant(stat_unused_no_emit) match(implementation = {vendor(xxx)}, device={kind(gpu)}) +#pragma omp declare variant(stat_unused_no_emit) match(implementation = {vendor(unknown)}, device = {kind(gpu)}) template static T stat_unused_T() { return 88; } diff --git a/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp b/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp index 7f84709b80d42..a9ed8f7486822 100644 --- a/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp +++ b/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp @@ -43,13 +43,13 @@ #define HEADER #ifdef GPU -#define CORRECT gpu -#define SUBSET nohost, gpu +#define SUBSET gpu +#define CORRECT nohost, gpu #define WRONG cpu, gpu #endif // GPU #ifdef NOHOST -#define CORRECT nohost -#define SUBSET nohost, gpu +#define SUBSET nohost +#define CORRECT nohost, gpu #define WRONG nohost, host #endif // NOHOST diff --git a/clang/test/Preprocessor/Weverything_pragma.c b/clang/test/Preprocessor/Weverything_pragma.c index 1815f554fffd6..f2cf97ed4a1ca 100644 --- a/clang/test/Preprocessor/Weverything_pragma.c +++ b/clang/test/Preprocessor/Weverything_pragma.c @@ -6,7 +6,7 @@ // but -Weverything forces it #define UNUSED_MACRO1 1 // expected-warning{{macro is not used}} -void foo() // expected-warning {{no previous prototype for function}} +void foo(void) // expected-warning {{no previous prototype for function}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} { // A diagnostic without DefaultIgnore, and not part of a group. 
diff --git a/clang/test/Preprocessor/pragma_diagnostic.c b/clang/test/Preprocessor/pragma_diagnostic.c index 99724623207f1..75d2bbc7190f3 100644 --- a/clang/test/Preprocessor/pragma_diagnostic.c +++ b/clang/test/Preprocessor/pragma_diagnostic.c @@ -35,19 +35,19 @@ #endif // Testing pragma clang diagnostic with -Weverything -void ppo(){} // First test that we do not diagnose on this. +void ppo(void){} // First test that we do not diagnose on this. #pragma clang diagnostic warning "-Weverything" -void ppp(){} // expected-warning {{no previous prototype for function 'ppp'}} +void ppp(void){} // expected-warning {{no previous prototype for function 'ppp'}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} #pragma clang diagnostic ignored "-Weverything" // Reset it. -void ppq(){} +void ppq(void){} #pragma clang diagnostic error "-Weverything" // Now set to error -void ppr(){} // expected-error {{no previous prototype for function 'ppr'}} +void ppr(void){} // expected-error {{no previous prototype for function 'ppr'}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} #pragma clang diagnostic warning "-Weverything" // This should not be effective -void pps(){} // expected-error {{no previous prototype for function 'pps'}} +void pps(void){} // expected-error {{no previous prototype for function 'pps'}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} diff --git a/clang/test/Preprocessor/pushable-diagnostics.c b/clang/test/Preprocessor/pushable-diagnostics.c index 4a0dd895a78e4..9eaf87d58f820 100644 --- a/clang/test/Preprocessor/pushable-diagnostics.c +++ b/clang/test/Preprocessor/pushable-diagnostics.c @@ -18,28 +18,28 @@ int c = 'df'; // expected-warning{{multi-character character constant}} // Test -Weverything -void ppo0(){} // first verify that we do not give anything on 
this +void ppo0(void){} // first verify that we do not give anything on this #pragma clang diagnostic push // now push #pragma clang diagnostic warning "-Weverything" -void ppr1(){} // expected-warning {{no previous prototype for function 'ppr1'}} +void ppr1(void){} // expected-warning {{no previous prototype for function 'ppr1'}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} #pragma clang diagnostic push // push again #pragma clang diagnostic ignored "-Weverything" // Set to ignore in this level. -void pps2(){} +void pps2(void){} #pragma clang diagnostic warning "-Weverything" // Set to warning in this level. -void ppt2(){} // expected-warning {{no previous prototype for function 'ppt2'}} +void ppt2(void){} // expected-warning {{no previous prototype for function 'ppt2'}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} #pragma clang diagnostic error "-Weverything" // Set to error in this level. 
-void ppt3(){} // expected-error {{no previous prototype for function 'ppt3'}} +void ppt3(void){} // expected-error {{no previous prototype for function 'ppt3'}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} #pragma clang diagnostic pop // pop should go back to warning level -void pps1(){} // expected-warning {{no previous prototype for function 'pps1'}} +void pps1(void){} // expected-warning {{no previous prototype for function 'pps1'}} // expected-note@-1{{declare 'static' if the function is not intended to be used outside of this translation unit}} #pragma clang diagnostic pop // Another pop should disble it again -void ppu(){} +void ppu(void){} diff --git a/clang/test/Sema/warn-strict-prototypes.c b/clang/test/Sema/warn-strict-prototypes.c index 5565a09060fc2..50b0f7d060f2e 100644 --- a/clang/test/Sema/warn-strict-prototypes.c +++ b/clang/test/Sema/warn-strict-prototypes.c @@ -1,15 +1,18 @@ // RUN: %clang_cc1 -triple i386-pc-unknown -fsyntax-only -Wstrict-prototypes -Wno-implicit-function-declaration -verify %s // RUN: %clang_cc1 -triple i386-pc-unknown -fsyntax-only -Wstrict-prototypes -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s +// function definition with 0 params, no prototype, no preceding declaration. +void foo0() {} // expected-warning {{this old-style function definition is not preceded by a prototype}} + // function declaration with unspecified params void foo1(); // expected-warning {{this function declaration is not a prototype}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:11}:"void" // function declaration with 0 params void foo2(void); -// function definition with 0 params(for both cases), -// valid according to 6.7.5.3/14 -void foo1() {} +// function definition with 0 params, no prototype. +void foo1() {} // expected-warning {{this old-style function definition is not preceded by a prototype}} +// function definition with 0 params, prototype. 
void foo2(void) {} // function type typedef unspecified params diff --git a/clang/test/Sema/warn-strict-prototypes.cpp b/clang/test/Sema/warn-strict-prototypes.cpp new file mode 100644 index 0000000000000..6a3839ff93672 --- /dev/null +++ b/clang/test/Sema/warn-strict-prototypes.cpp @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -verify -fsyntax-only -Wstrict-prototypes %s +// expected-no-diagnostics + +void decl(); +void decl_void(void); + +void def() {} +void def_void(void) {} diff --git a/clang/test/Sema/warn-strict-prototypes.m b/clang/test/Sema/warn-strict-prototypes.m index 66d574f75f802..e2fde8ee38fc1 100644 --- a/clang/test/Sema/warn-strict-prototypes.m +++ b/clang/test/Sema/warn-strict-prototypes.m @@ -10,7 +10,7 @@ @interface Foo @end -void foo() { +void foo() { // expected-warning {{this old-style function definition is not preceded by a prototype}} void (^block)() = // expected-warning {{this block declaration is not a prototype}} ^void(int arg) { // no warning }; diff --git a/clang/test/Sema/warn-unused-parameters.c b/clang/test/Sema/warn-unused-parameters.c index 11db7300c5c43..d325f887f885a 100644 --- a/clang/test/Sema/warn-unused-parameters.c +++ b/clang/test/Sema/warn-unused-parameters.c @@ -7,7 +7,7 @@ int f0(int x, return x; } -void f1() { +void f1(void) { (void)^(int x, int y, int z __attribute__((unused))) { return x; }; diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/def-and-decl.c b/clang/test/utils/update_cc_test_checks/Inputs/def-and-decl.c similarity index 100% rename from llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/def-and-decl.c rename to clang/test/utils/update_cc_test_checks/Inputs/def-and-decl.c diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/def-and-decl.c.expected b/clang/test/utils/update_cc_test_checks/Inputs/def-and-decl.c.expected similarity index 100% rename from llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/def-and-decl.c.expected rename to 
clang/test/utils/update_cc_test_checks/Inputs/def-and-decl.c.expected diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/mangled_names.c b/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c similarity index 100% rename from llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/mangled_names.c rename to clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/mangled_names.c.expected b/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.expected similarity index 100% rename from llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/mangled_names.c.expected rename to clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.expected diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected b/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected similarity index 100% rename from llvm/test/tools/UpdateTestChecks/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected rename to clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/def-and-decl.test b/clang/test/utils/update_cc_test_checks/def-and-decl.test similarity index 100% rename from llvm/test/tools/UpdateTestChecks/update_cc_test_checks/def-and-decl.test rename to clang/test/utils/update_cc_test_checks/def-and-decl.test diff --git a/clang/test/utils/update_cc_test_checks/lit.local.cfg b/clang/test/utils/update_cc_test_checks/lit.local.cfg new file mode 100644 index 0000000000000..0250446423cb5 --- /dev/null +++ b/clang/test/utils/update_cc_test_checks/lit.local.cfg @@ -0,0 +1,25 @@ +import os + +import lit.util + +# python 2.7 backwards compatibility +try: + from shlex import quote as shell_quote +except ImportError: + from pipes import quote as shell_quote + + +config.test_format = 
lit.formats.ShTest(execute_external=False) +config.suffixes = ['.test'] + +clang_path = os.path.join(config.clang_tools_dir, 'clang') +extra_args = '--clang ' + shell_quote(clang_path) +opt_path = os.path.join(config.llvm_tools_dir, 'opt') +extra_args += ' --opt ' + shell_quote(opt_path) +script_path = os.path.join(config.llvm_src_root, 'utils', + 'update_cc_test_checks.py') +assert os.path.isfile(script_path) +config.substitutions.append( + ('%update_cc_test_checks', "%s %s %s" % ( + shell_quote(config.python_executable), shell_quote(script_path), + extra_args))) diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/mangled_names.test b/clang/test/utils/update_cc_test_checks/mangled_names.test similarity index 100% rename from llvm/test/tools/UpdateTestChecks/update_cc_test_checks/mangled_names.test rename to clang/test/utils/update_cc_test_checks/mangled_names.test diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index 50788cb7cf8b6..9105c616786fb 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -18,6 +18,7 @@ #include "llvm/Support/JSON.h" #include "llvm/Support/Program.h" #include "llvm/Support/Signals.h" +#include "llvm/Support/ThreadPool.h" #include "llvm/Support/Threading.h" #include #include @@ -484,14 +485,9 @@ int main(int argc, const char **argv) { DependencyScanningService Service(ScanMode, Format, ReuseFileManager, SkipExcludedPPRanges); -#if LLVM_ENABLE_THREADS - unsigned NumWorkers = - NumThreads == 0 ? 
llvm::hardware_concurrency() : NumThreads; -#else - unsigned NumWorkers = 1; -#endif + llvm::ThreadPool Pool(llvm::hardware_concurrency(NumThreads)); std::vector> WorkerTools; - for (unsigned I = 0; I < NumWorkers; ++I) + for (unsigned I = 0; I < Pool.getThreadCount(); ++I) WorkerTools.push_back(std::make_unique(Service)); std::vector Inputs; @@ -499,7 +495,6 @@ int main(int argc, const char **argv) { AdjustingCompilations->getAllCompileCommands()) Inputs.emplace_back(Cmd); - std::vector WorkerThreads; std::atomic HadErrors(false); FullDeps FD; std::mutex Lock; @@ -507,11 +502,11 @@ int main(int argc, const char **argv) { if (Verbose) { llvm::outs() << "Running clang-scan-deps on " << Inputs.size() - << " files using " << NumWorkers << " workers\n"; + << " files using " << Pool.getThreadCount() << " workers\n"; } - for (unsigned I = 0; I < NumWorkers; ++I) { - auto Worker = [I, &Lock, &Index, &Inputs, &HadErrors, &FD, &WorkerTools, - &DependencyOS, &Errs]() { + for (unsigned I = 0; I < Pool.getThreadCount(); ++I) { + Pool.async([I, &Lock, &Index, &Inputs, &HadErrors, &FD, &WorkerTools, + &DependencyOS, &Errs]() { llvm::StringSet<> AlreadySeenModules; while (true) { const SingleCommandCompilationDatabase *Input; @@ -543,16 +538,9 @@ int main(int argc, const char **argv) { HadErrors = true; } } - }; -#if LLVM_ENABLE_THREADS - WorkerThreads.emplace_back(std::move(Worker)); -#else - // Run the worker without spawning a thread when threads are disabled. 
- Worker(); -#endif + }); } - for (auto &W : WorkerThreads) - W.join(); + Pool.wait(); if (Format == ScanningOutputFormat::Full) FD.printFullOutput(llvm::outs()); diff --git a/clang/tools/libclang/CXIndexDataConsumer.cpp b/clang/tools/libclang/CXIndexDataConsumer.cpp index ad871228ccdfb..fb04a06f8ae7e 100644 --- a/clang/tools/libclang/CXIndexDataConsumer.cpp +++ b/clang/tools/libclang/CXIndexDataConsumer.cpp @@ -1245,6 +1245,9 @@ static CXIdxEntityKind getEntityKindFromSymbolKind(SymbolKind K, SymbolLanguage case SymbolKind::Macro: case SymbolKind::ClassProperty: case SymbolKind::Using: + case SymbolKind::TemplateTypeParm: + case SymbolKind::TemplateTemplateParm: + case SymbolKind::NonTypeTemplateParm: return CXIdxEntity_Unexposed; case SymbolKind::Enum: return CXIdxEntity_Enum; diff --git a/clang/unittests/Index/IndexTests.cpp b/clang/unittests/Index/IndexTests.cpp index a279f48fbb375..068b30ebfa8af 100644 --- a/clang/unittests/Index/IndexTests.cpp +++ b/clang/unittests/Index/IndexTests.cpp @@ -249,8 +249,13 @@ TEST(IndexTest, IndexTypeParmDecls) { Index->Symbols.clear(); tooling::runToolOnCode(std::make_unique(Index, Opts), Code); EXPECT_THAT(Index->Symbols, - AllOf(Contains(QName("Foo::T")), Contains(QName("Foo::I")), - Contains(QName("Foo::C")), Contains(QName("Foo::NoRef")))); + AllOf(Contains(AllOf(QName("Foo::T"), + Kind(SymbolKind::TemplateTypeParm))), + Contains(AllOf(QName("Foo::I"), + Kind(SymbolKind::NonTypeTemplateParm))), + Contains(AllOf(QName("Foo::C"), + Kind(SymbolKind::TemplateTemplateParm))), + Contains(QName("Foo::NoRef")))); } TEST(IndexTest, UsingDecls) { diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index c3b22e067c9be..ba825a90edb29 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -107,6 +107,7 @@ static std::string ReadPCHRecord(StringRef type) { .Case("IdentifierInfo *", "Record.readIdentifier()") .Case("StringRef", 
"Record.readString()") .Case("ParamIdx", "ParamIdx::deserialize(Record.readInt())") + .Case("OMPTraitInfo *", "Record.readOMPTraitInfo()") .Default("Record.readInt()"); } @@ -130,6 +131,8 @@ static std::string WritePCHRecord(StringRef type, StringRef name) { .Case("StringRef", "AddString(" + std::string(name) + ");\n") .Case("ParamIdx", "push_back(" + std::string(name) + ".serialize());\n") + .Case("OMPTraitInfo *", + "writeOMPTraitInfo(" + std::string(name) + ");\n") .Default("push_back(" + std::string(name) + ");\n"); } @@ -338,7 +341,7 @@ namespace { void writeDump(raw_ostream &OS) const override { if (type == "FunctionDecl *" || type == "NamedDecl *") { OS << " OS << \" \";\n"; - OS << " dumpBareDeclRef(SA->get" << getUpperName() << "());\n"; + OS << " dumpBareDeclRef(SA->get" << getUpperName() << "());\n"; } else if (type == "IdentifierInfo *") { // Some non-optional (comma required) identifier arguments can be the // empty string but are then recorded as a nullptr. @@ -360,6 +363,8 @@ namespace { OS << " if (SA->get" << getUpperName() << "().isValid())\n "; OS << " OS << \" \" << SA->get" << getUpperName() << "().getSourceIndex();\n"; + } else if (type == "OMPTraitInfo *") { + OS << " OS << \" \" << *SA->get" << getUpperName() << "();\n"; } else { llvm_unreachable("Unknown SimpleArgument type!"); } @@ -500,7 +505,7 @@ namespace { OS << " if (is" << getLowerName() << "Expr)\n"; OS << " return " << getLowerName() << "Expr && (" << getLowerName() << "Expr->isValueDependent() || " << getLowerName() - << "Expr->isTypeDependent());\n"; + << "Expr->isTypeDependent());\n"; OS << " else\n"; OS << " return " << getLowerName() << "Type->getType()->isDependentType();\n"; @@ -525,11 +530,11 @@ namespace { void writeASTVisitorTraversal(raw_ostream &OS) const override { StringRef Name = getUpperName(); OS << " if (A->is" << Name << "Expr()) {\n" - << " if (!getDerived().TraverseStmt(A->get" << Name << "Expr()))\n" - << " return false;\n" + << " if 
(!getDerived().TraverseStmt(A->get" << Name << "Expr()))\n" + << " return false;\n" << " } else if (auto *TSI = A->get" << Name << "Type()) {\n" << " if (!getDerived().TraverseTypeLoc(TSI->getTypeLoc()))\n" - << " return false;\n" + << " return false;\n" << " }\n"; } @@ -658,7 +663,7 @@ namespace { std::string IteratorType = getLowerName().str() + "_iterator"; std::string BeginFn = getLowerName().str() + "_begin()"; std::string EndFn = getLowerName().str() + "_end()"; - + OS << " typedef " << Type << "* " << IteratorType << ";\n"; OS << " " << IteratorType << " " << BeginFn << " const {" << " return " << ArgName << "; }\n"; @@ -915,14 +920,14 @@ namespace { for (size_t I = 0; I < enums.size(); ++I) { if (Uniques.insert(enums[I]).second) OS << " case " << getAttrName() << "Attr::" << enums[I] - << ": return \"" << values[I] << "\";\n"; + << ": return \"" << values[I] << "\";\n"; } OS << " }\n" << " llvm_unreachable(\"No enumerator with that value\");\n" << " }\n"; } }; - + class VariadicEnumArgument: public VariadicArgument { std::string type, QualifiedTypeName; std::vector values, enums, uniques; @@ -945,13 +950,13 @@ namespace { enums(Arg.getValueAsListOfStrings("Enums")), uniques(uniqueEnumsInOrder(enums)) { QualifiedTypeName = getAttrName().str() + "Attr::" + type; - + // FIXME: Emit a proper error assert(!uniques.empty()); } bool isVariadicEnumArg() const override { return true; } - + void writeDeclarations(raw_ostream &OS) const override { auto i = uniques.cbegin(), e = uniques.cend(); // The last one needs to not have a comma. 
@@ -964,7 +969,7 @@ namespace { OS << " " << *e << "\n"; OS << " };\n"; OS << "private:\n"; - + VariadicArgument::writeDeclarations(OS); } @@ -1041,7 +1046,7 @@ namespace { OS << " VersionTuple get" << getUpperName() << "() const {\n"; OS << " return " << getLowerName() << ";\n"; OS << " }\n"; - OS << " void set" << getUpperName() + OS << " void set" << getUpperName() << "(ASTContext &C, VersionTuple V) {\n"; OS << " " << getLowerName() << " = V;\n"; OS << " }"; @@ -1308,6 +1313,8 @@ createArgument(const Record &Arg, StringRef Attr, Ptr = std::make_unique(Arg, Attr); else if (ArgName == "VersionArgument") Ptr = std::make_unique(Arg, Attr); + else if (ArgName == "OMPTraitInfoArgument") + Ptr = std::make_unique(Arg, Attr, "OMPTraitInfo *"); if (!Ptr) { // Search in reverse order so that the most-derived type is handled first. @@ -2252,10 +2259,10 @@ void clang::EmitClangAttrClass(RecordKeeper &Records, raw_ostream &OS) { // When attribute documentation can be generated as part of the build // itself, this code can be removed. (void)R.getValueAsListOfDefs("Documentation"); - + if (!R.getValueAsBit("ASTNode")) continue; - + ArrayRef> Supers = R.getSuperClasses(); assert(!Supers.empty() && "Forgot to specify a superclass for the attr"); std::string SuperName; @@ -2437,7 +2444,7 @@ void clang::EmitClangAttrClass(RecordKeeper &Records, raw_ostream &OS) { } OS << " {\n"; - + for (auto const &ai : Args) { if (!shouldEmitArg(ai)) continue; ai->writeCtorBody(OS); @@ -2452,7 +2459,7 @@ void clang::EmitClangAttrClass(RecordKeeper &Records, raw_ostream &OS) { // Emit a constructor that takes all the non-fake arguments. if (HasFakeArg) emitCtor(true, false); - + // Emit a constructor that takes all the non-fake, non-optional arguments. 
if (HasOptArg) emitCtor(false, false); @@ -2461,7 +2468,7 @@ void clang::EmitClangAttrClass(RecordKeeper &Records, raw_ostream &OS) { OS << " void printPretty(raw_ostream &OS,\n" << " const PrintingPolicy &Policy) const;\n"; OS << " const char *getSpelling() const;\n"; - + if (!ElideSpelling) { assert(!SemanticToSyntacticMap.empty() && "Empty semantic mapping list"); OS << " Spelling getSemanticSpelling() const {\n"; @@ -2506,7 +2513,7 @@ void clang::EmitClangAttrImpl(RecordKeeper &Records, raw_ostream &OS) { for (auto *Attr : Attrs) { Record &R = *Attr; - + if (!R.getValueAsBit("ASTNode")) continue; @@ -2978,7 +2985,7 @@ static void GenerateHasAttrSpellingStringSwitch( // them. If the attribute has no scope, the version information must not // have the default value (1), as that's incorrect. Instead, the unscoped // attribute version information should be taken from the SD-6 standing - // document, which can be found at: + // document, which can be found at: // https://isocpp.org/std/standing-documents/sd-6-sg10-feature-test-recommendations int Version = 1; @@ -3270,7 +3277,7 @@ void EmitClangAttrParsedAttrList(RecordKeeper &Records, raw_ostream &OS) { OS << "#ifndef PARSED_ATTR\n"; OS << "#define PARSED_ATTR(NAME) NAME\n"; OS << "#endif\n\n"; - + ParsedAttrMap Names = getParsedAttrList(Records); for (const auto &I : Names) { OS << "PARSED_ATTR(" << I.first << ")\n"; diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index 6c4856f05a2dc..21af345dc6bb8 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -342,21 +342,33 @@ if(APPLE) if(COMPILER_RT_ENABLE_IOS) list(APPEND DARWIN_EMBEDDED_PLATFORMS ios) + set(DARWIN_ios_MIN_VER 9.0) set(DARWIN_ios_MIN_VER_FLAG -miphoneos-version-min) set(DARWIN_ios_SANITIZER_MIN_VER_FLAG - ${DARWIN_ios_MIN_VER_FLAG}=9.0) + ${DARWIN_ios_MIN_VER_FLAG}=${DARWIN_ios_MIN_VER}) + set(DARWIN_iossim_MIN_VER_FLAG -mios-simulator-version-min) + 
set(DARWIN_iossim_SANITIZER_MIN_VER_FLAG + ${DARWIN_iossim_MIN_VER_FLAG}=${DARWIN_ios_MIN_VER}) endif() if(COMPILER_RT_ENABLE_WATCHOS) list(APPEND DARWIN_EMBEDDED_PLATFORMS watchos) + set(DARWIN_watchos_MIN_VER 2.0) set(DARWIN_watchos_MIN_VER_FLAG -mwatchos-version-min) set(DARWIN_watchos_SANITIZER_MIN_VER_FLAG - ${DARWIN_watchos_MIN_VER_FLAG}=2.0) + ${DARWIN_watchos_MIN_VER_FLAG}=${DARWIN_watchos_MIN_VER}) + set(DARWIN_watchossim_MIN_VER_FLAG -mwatchos-simulator-version-min) + set(DARWIN_watchossim_SANITIZER_MIN_VER_FLAG + ${DARWIN_watchossim_MIN_VER_FLAG}=${DARWIN_watchos_MIN_VER}) endif() if(COMPILER_RT_ENABLE_TVOS) list(APPEND DARWIN_EMBEDDED_PLATFORMS tvos) + set(DARWIN_tvos_MIN_VER 9.0) set(DARWIN_tvos_MIN_VER_FLAG -mtvos-version-min) set(DARWIN_tvos_SANITIZER_MIN_VER_FLAG - ${DARWIN_tvos_MIN_VER_FLAG}=9.0) + ${DARWIN_tvos_MIN_VER_FLAG}=${DARWIN_tvos_MIN_VER}) + set(DARWIN_tvossim_MIN_VER_FLAG -mtvos-simulator-version-min) + set(DARWIN_tvossim_SANITIZER_MIN_VER_FLAG + ${DARWIN_tvossim_MIN_VER_FLAG}=${DARWIN_tvos_MIN_VER}) endif() set(SANITIZER_COMMON_SUPPORTED_OS osx) @@ -368,8 +380,9 @@ if(APPLE) # Note: In order to target x86_64h on OS X the minimum deployment target must # be 10.8 or higher. 
set(DEFAULT_SANITIZER_MIN_OSX_VERSION 10.10) + set(DARWIN_osx_MIN_VER_FLAG "-mmacosx-version-min") if(NOT SANITIZER_MIN_OSX_VERSION) - string(REGEX MATCH "-mmacosx-version-min=([.0-9]+)" + string(REGEX MATCH "${DARWIN_osx_MIN_VER_FLAG}=([.0-9]+)" MACOSX_VERSION_MIN_FLAG "${CMAKE_CXX_FLAGS}") if(MACOSX_VERSION_MIN_FLAG) set(SANITIZER_MIN_OSX_VERSION "${CMAKE_MATCH_1}") @@ -403,10 +416,10 @@ if(APPLE) set(DARWIN_osx_CFLAGS ${DARWIN_COMMON_CFLAGS} - -mmacosx-version-min=${SANITIZER_MIN_OSX_VERSION}) + ${DARWIN_osx_MIN_VER_FLAG}=${SANITIZER_MIN_OSX_VERSION}) set(DARWIN_osx_LINK_FLAGS ${DARWIN_COMMON_LINK_FLAGS} - -mmacosx-version-min=${SANITIZER_MIN_OSX_VERSION}) + ${DARWIN_osx_MIN_VER_FLAG}=${SANITIZER_MIN_OSX_VERSION}) if(DARWIN_osx_SYSROOT) list(APPEND DARWIN_osx_CFLAGS -isysroot ${DARWIN_osx_SYSROOT}) @@ -431,11 +444,11 @@ if(APPLE) if(DARWIN_${platform}sim_SYSROOT) set(DARWIN_${platform}sim_CFLAGS ${DARWIN_COMMON_CFLAGS} - ${DARWIN_${platform}_SANITIZER_MIN_VER_FLAG} + ${DARWIN_${platform}sim_SANITIZER_MIN_VER_FLAG} -isysroot ${DARWIN_${platform}sim_SYSROOT}) set(DARWIN_${platform}sim_LINK_FLAGS ${DARWIN_COMMON_LINK_FLAGS} - ${DARWIN_${platform}_SANITIZER_MIN_VER_FLAG} + ${DARWIN_${platform}sim_SANITIZER_MIN_VER_FLAG} -isysroot ${DARWIN_${platform}sim_SYSROOT}) set(DARWIN_${platform}sim_SKIP_CC_KEXT On) @@ -487,6 +500,10 @@ if(APPLE) endforeach() endif() + # Explicitly disable unsupported Sanitizer configurations.
+ list(REMOVE_ITEM FUZZER_SUPPORTED_OS "watchos") + list(REMOVE_ITEM FUZZER_SUPPORTED_OS "watchossim") + # for list_intersect include(CompilerRTUtils) diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.h b/compiler-rt/lib/scudo/standalone/allocator_config.h index 3d338501ae4ae..ad2a17ef7014a 100644 --- a/compiler-rt/lib/scudo/standalone/allocator_config.h +++ b/compiler-rt/lib/scudo/standalone/allocator_config.h @@ -40,15 +40,15 @@ struct AndroidConfig { using SizeClassMap = AndroidSizeClassMap; #if SCUDO_CAN_USE_PRIMARY64 // 256MB regions - typedef SizeClassAllocator64 Primary; #else // 256KB regions - typedef SizeClassAllocator32 Primary; + typedef SizeClassAllocator32 Primary; #endif // Cache blocks up to 2MB - typedef MapAllocator> Secondary; + typedef MapAllocator> Secondary; template using TSDRegistryT = TSDRegistrySharedT; // Shared, max 2 TSDs. }; @@ -57,12 +57,12 @@ struct AndroidSvelteConfig { using SizeClassMap = SvelteSizeClassMap; #if SCUDO_CAN_USE_PRIMARY64 // 128MB regions - typedef SizeClassAllocator64 Primary; + typedef SizeClassAllocator64 Primary; #else // 64KB regions - typedef SizeClassAllocator32 Primary; + typedef SizeClassAllocator32 Primary; #endif - typedef MapAllocator> Secondary; + typedef MapAllocator> Secondary; template using TSDRegistryT = TSDRegistrySharedT; // Shared, only 1 TSD. }; diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index e8390a7b44f16..f49fc9aac84cb 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -32,6 +32,8 @@ extern "C" inline void EmptyCallback() {} namespace scudo { +enum class Option { ReleaseInterval }; + template class Allocator { public: @@ -624,8 +626,14 @@ class Allocator { return Options.MayReturnNull; } - // TODO(kostyak): implement this as a "backend" to mallopt. 
- bool setOption(UNUSED uptr Option, UNUSED uptr Value) { return false; } + bool setOption(Option O, sptr Value) { + if (O == Option::ReleaseInterval) { + Primary.setReleaseToOsIntervalMs(static_cast(Value)); + Secondary.setReleaseToOsIntervalMs(static_cast(Value)); + return true; + } + return false; + } // Return the usable size for a given chunk. Technically we lie, as we just // report the actual size of a chunk. This is done to counteract code actively diff --git a/compiler-rt/lib/scudo/standalone/flags.inc b/compiler-rt/lib/scudo/standalone/flags.inc index 27aa969e608ac..342af1c79ad64 100644 --- a/compiler-rt/lib/scudo/standalone/flags.inc +++ b/compiler-rt/lib/scudo/standalone/flags.inc @@ -45,6 +45,6 @@ SCUDO_FLAG(bool, may_return_null, true, "returning NULL in otherwise non-fatal error scenarios, eg: OOM, " "invalid allocation alignments, etc.") -SCUDO_FLAG(int, release_to_os_interval_ms, SCUDO_ANDROID ? 1000 : 5000, +SCUDO_FLAG(int, release_to_os_interval_ms, SCUDO_ANDROID ? INT32_MIN : 5000, "Interval (in milliseconds) at which to attempt release of unused " "memory to the OS. Negative values disable the feature.") diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h index 294043930e862..79345cb348b64 100644 --- a/compiler-rt/lib/scudo/standalone/primary32.h +++ b/compiler-rt/lib/scudo/standalone/primary32.h @@ -38,14 +38,18 @@ namespace scudo { // Memory used by this allocator is never unmapped but can be partially // reclaimed if the platform allows for it. -template class SizeClassAllocator32 { +template class SizeClassAllocator32 { public: typedef SizeClassMapT SizeClassMap; // The bytemap can only track UINT8_MAX - 1 classes. static_assert(SizeClassMap::LargestClassId <= (UINT8_MAX - 1), ""); // Regions should be large enough to hold the largest Block. 
static_assert((1UL << RegionSizeLog) >= SizeClassMap::MaxSize, ""); - typedef SizeClassAllocator32 ThisT; + typedef SizeClassAllocator32 ThisT; typedef SizeClassAllocatorLocalCache CacheT; typedef typename CacheT::TransferBatch TransferBatch; static const bool SupportsMemoryTagging = false; @@ -78,7 +82,7 @@ template class SizeClassAllocator32 { Sci->CanRelease = (I != SizeClassMap::BatchClassId) && (getSizeByClassId(I) >= (PageSize / 32)); } - ReleaseToOsIntervalMs = ReleaseToOsInterval; + setReleaseToOsIntervalMs(ReleaseToOsInterval); } void init(s32 ReleaseToOsInterval) { memset(this, 0, sizeof(*this)); @@ -176,6 +180,15 @@ template class SizeClassAllocator32 { getStats(Str, I, 0); } + void setReleaseToOsIntervalMs(s32 Interval) { + if (Interval >= MaxReleaseToOsIntervalMs) { + Interval = MaxReleaseToOsIntervalMs; + } else if (Interval <= MinReleaseToOsIntervalMs) { + Interval = MinReleaseToOsIntervalMs; + } + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + } + uptr releaseToOS() { uptr TotalReleasedBytes = 0; for (uptr I = 0; I < NumClasses; I++) { @@ -356,6 +369,10 @@ template class SizeClassAllocator32 { AvailableChunks, Rss >> 10, Sci->ReleaseInfo.RangesReleased); } + s32 getReleaseToOsIntervalMs() { + return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); + } + NOINLINE uptr releaseToOSMaybe(SizeClassInfo *Sci, uptr ClassId, bool Force = false) { const uptr BlockSize = getSizeByClassId(ClassId); @@ -374,7 +391,7 @@ template class SizeClassAllocator32 { } if (!Force) { - const s32 IntervalMs = ReleaseToOsIntervalMs; + const s32 IntervalMs = getReleaseToOsIntervalMs(); if (IntervalMs < 0) return 0; if (Sci->ReleaseInfo.LastReleaseAtNs + @@ -414,7 +431,7 @@ template class SizeClassAllocator32 { // through the whole NumRegions. 
uptr MinRegionIndex; uptr MaxRegionIndex; - s32 ReleaseToOsIntervalMs; + atomic_s32 ReleaseToOsIntervalMs; // Unless several threads request regions simultaneously from different size // classes, the stash rarely contains more than 1 entry. static constexpr uptr MaxStashedRegions = 4; diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 9d8dcac6562a0..bc31db88ebb8b 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -40,11 +40,15 @@ namespace scudo { // released if the platform allows for it. template class SizeClassAllocator64 { public: typedef SizeClassMapT SizeClassMap; typedef SizeClassAllocator64 ThisT; typedef SizeClassAllocatorLocalCache CacheT; @@ -90,7 +94,7 @@ class SizeClassAllocator64 { (getSizeByClassId(I) >= (PageSize / 32)); Region->RandState = getRandomU32(&Seed); } - ReleaseToOsIntervalMs = ReleaseToOsInterval; + setReleaseToOsIntervalMs(ReleaseToOsInterval); if (SupportsMemoryTagging) UseMemoryTagging = systemSupportsMemoryTagging(); @@ -186,6 +190,15 @@ class SizeClassAllocator64 { getStats(Str, I, 0); } + void setReleaseToOsIntervalMs(s32 Interval) { + if (Interval >= MaxReleaseToOsIntervalMs) { + Interval = MaxReleaseToOsIntervalMs; + } else if (Interval <= MinReleaseToOsIntervalMs) { + Interval = MinReleaseToOsIntervalMs; + } + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + } + uptr releaseToOS() { uptr TotalReleasedBytes = 0; for (uptr I = 0; I < NumClasses; I++) { @@ -241,7 +254,7 @@ class SizeClassAllocator64 { uptr PrimaryBase; RegionInfo *RegionInfoArray; MapPlatformData Data; - s32 ReleaseToOsIntervalMs; + atomic_s32 ReleaseToOsIntervalMs; bool UseMemoryTagging; RegionInfo *getRegionInfo(uptr ClassId) const { @@ -375,6 +388,10 @@ class SizeClassAllocator64 { getRegionBaseByClassId(ClassId)); } + s32 getReleaseToOsIntervalMs() { + return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); + 
} + NOINLINE uptr releaseToOSMaybe(RegionInfo *Region, uptr ClassId, bool Force = false) { const uptr BlockSize = getSizeByClassId(ClassId); @@ -394,7 +411,7 @@ class SizeClassAllocator64 { } if (!Force) { - const s32 IntervalMs = ReleaseToOsIntervalMs; + const s32 IntervalMs = getReleaseToOsIntervalMs(); if (IntervalMs < 0) return 0; if (Region->ReleaseInfo.LastReleaseAtNs + diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index deba7a930d986..8ae8108b2eaad 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -62,7 +62,9 @@ class MapAllocatorNoCache { void releaseToOS() {} }; -template +template class MapAllocatorCache { public: // Fuchsia doesn't allow releasing Secondary blocks yet. Note that 0 length @@ -71,7 +73,7 @@ class MapAllocatorCache { static_assert(!SCUDO_FUCHSIA || MaxEntriesCount == 0U, ""); void initLinkerInitialized(s32 ReleaseToOsInterval) { - ReleaseToOsIntervalMs = ReleaseToOsInterval; + setReleaseToOsIntervalMs(ReleaseToOsInterval); } void init(s32 ReleaseToOsInterval) { memset(this, 0, sizeof(*this)); @@ -105,11 +107,11 @@ class MapAllocatorCache { } } } + s32 Interval; if (EmptyCache) empty(); - else if (ReleaseToOsIntervalMs >= 0) - releaseOlderThan(Time - - static_cast(ReleaseToOsIntervalMs) * 1000000); + else if ((Interval = getReleaseToOsIntervalMs()) >= 0) + releaseOlderThan(Time - static_cast(Interval) * 1000000); return EntryCached; } @@ -142,6 +144,15 @@ class MapAllocatorCache { return MaxEntriesCount != 0U && Size <= MaxEntrySize; } + void setReleaseToOsIntervalMs(s32 Interval) { + if (Interval >= MaxReleaseToOsIntervalMs) { + Interval = MaxReleaseToOsIntervalMs; + } else if (Interval <= MinReleaseToOsIntervalMs) { + Interval = MinReleaseToOsIntervalMs; + } + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + } + void releaseToOS() { releaseOlderThan(UINT64_MAX); } void disable() { Mutex.lock(); } 
@@ -189,6 +200,10 @@ class MapAllocatorCache { } } + s32 getReleaseToOsIntervalMs() { + return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); + } + struct CachedBlock { uptr Block; uptr BlockEnd; @@ -203,7 +218,7 @@ class MapAllocatorCache { u32 EntriesCount; uptr LargestSize; u32 IsFullEvents; - s32 ReleaseToOsIntervalMs; + atomic_s32 ReleaseToOsIntervalMs; }; template class MapAllocator { @@ -251,6 +266,10 @@ template class MapAllocator { static uptr canCache(uptr Size) { return CacheT::canCache(Size); } + void setReleaseToOsIntervalMs(s32 Interval) { + Cache.setReleaseToOsIntervalMs(Interval); + } + void releaseToOS() { Cache.releaseToOS(); } private: diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c.inc b/compiler-rt/lib/scudo/standalone/wrappers_c.inc index 91f615dcb8f84..314a835074e64 100644 --- a/compiler-rt/lib/scudo/standalone/wrappers_c.inc +++ b/compiler-rt/lib/scudo/standalone/wrappers_c.inc @@ -157,7 +157,18 @@ void SCUDO_PREFIX(malloc_postinit)() { INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, UNUSED int value) { if (param == M_DECAY_TIME) { - // TODO(kostyak): set release_to_os_interval_ms accordingly. + if (SCUDO_ANDROID) { + if (value == 0) { + // Will set the release values to their minimum values. + value = INT32_MIN; + } else { + // Will set the release values to their maximum values. 
+ value = INT32_MAX; + } + } + + SCUDO_ALLOCATOR.setOption(scudo::Option::ReleaseInterval, + static_cast(value)); return 1; } else if (param == M_PURGE) { SCUDO_ALLOCATOR.releaseToOS(); diff --git a/compiler-rt/test/asan/CMakeLists.txt b/compiler-rt/test/asan/CMakeLists.txt index f756064f47e05..1c2633eb4597b 100644 --- a/compiler-rt/test/asan/CMakeLists.txt +++ b/compiler-rt/test/asan/CMakeLists.txt @@ -44,6 +44,7 @@ endif() foreach(arch ${ASAN_TEST_ARCH}) set(ASAN_TEST_TARGET_ARCH ${arch}) set(ASAN_TEST_APPLE_PLATFORM "osx") + set(ASAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_osx_MIN_VER_FLAG}") string(TOLOWER "-${arch}-${OS_NAME}" ASAN_TEST_CONFIG_SUFFIX) get_bits_for_arch(${arch} ASAN_TEST_BITS) get_test_cc_for_arch(${arch} ASAN_TEST_TARGET_CC ASAN_TEST_TARGET_CFLAGS) @@ -104,6 +105,7 @@ if(APPLE) set(ASAN_TEST_CONFIG_SUFFIX "-${arch}-${platform}") set(ASAN_TEST_APPLE_PLATFORM "${platform}") set(ASAN_TEST_TARGET_ARCH "${arch}") + set(ASAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_${platform}_MIN_VER_FLAG}") get_bits_for_arch(${arch} ASAN_TEST_BITS) configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in diff --git a/compiler-rt/test/asan/lit.site.cfg.py.in b/compiler-rt/test/asan/lit.site.cfg.py.in index f76b306f8577d..81cebde2029ed 100644 --- a/compiler-rt/test/asan/lit.site.cfg.py.in +++ b/compiler-rt/test/asan/lit.site.cfg.py.in @@ -6,6 +6,7 @@ config.target_cflags = "@ASAN_TEST_TARGET_CFLAGS@" config.clang = "@ASAN_TEST_TARGET_CC@" config.bits = "@ASAN_TEST_BITS@" config.apple_platform = "@ASAN_TEST_APPLE_PLATFORM@" +config.apple_platform_min_deployment_target_flag = "@ASAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG@" config.asan_dynamic = @ASAN_TEST_DYNAMIC@ config.target_arch = "@ASAN_TEST_TARGET_ARCH@" diff --git a/compiler-rt/test/fuzzer/CMakeLists.txt b/compiler-rt/test/fuzzer/CMakeLists.txt index 5a027bd07b40b..c12a04b6f2702 100644 --- a/compiler-rt/test/fuzzer/CMakeLists.txt +++ b/compiler-rt/test/fuzzer/CMakeLists.txt @@ -53,6 +53,7 @@ 
macro(test_fuzzer stdlib) set(LIBFUZZER_TEST_TARGET_ARCH ${arch}) set(LIBFUZZER_TEST_APPLE_PLATFORM "osx") + set(LIBFUZZER_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_osx_MIN_VER_FLAG}") set(LIBFUZZER_TEST_STDLIB ${stdlib}) @@ -113,6 +114,7 @@ if (APPLE) set(LIBFUZZER_TEST_CONFIG_SUFFIX "-${arch}-${platform}") set(LIBFUZZER_TEST_APPLE_PLATFORM "${platform}") set(LIBFUZZER_TEST_TARGET_ARCH "${arch}") + set(LIBFUZZER_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_${platform}_MIN_VER_FLAG}") configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/lit.site.cfg.py diff --git a/compiler-rt/test/fuzzer/lit.site.cfg.py.in b/compiler-rt/test/fuzzer/lit.site.cfg.py.in index cc6a0908d142c..218688c182c7c 100644 --- a/compiler-rt/test/fuzzer/lit.site.cfg.py.in +++ b/compiler-rt/test/fuzzer/lit.site.cfg.py.in @@ -5,6 +5,7 @@ config.target_flags = "@LIBFUZZER_TEST_FLAGS@" config.c_compiler = "@LIBFUZZER_TEST_COMPILER@" config.stdlib = "@LIBFUZZER_TEST_STDLIB@" config.apple_platform = "@LIBFUZZER_TEST_APPLE_PLATFORM@" +config.apple_platform_min_deployment_target_flag = "@LIBFUZZER_TEST_MIN_DEPLOYMENT_TARGET_FLAG@" config.name_suffix = "@LIBFUZZER_TEST_CONFIG_SUFFIX@" config.osx_sysroot_flag = "@OSX_SYSROOT_FLAG@" diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index bf2190a93dad3..6c4a6f526551e 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -258,6 +258,44 @@ lit.util.usePlatformSdkOnDarwin(config, lit_config) if config.host_os == 'Darwin': + def get_apple_platform_version_aligned_with(macos_version, apple_platform): + """ + Given a macOS version (`macos_version`) returns the corresponding version for + the specified Apple platform if it exists. + + `macos_version` - The macOS version as a string. + `apple_platform` - The Apple platform name as a string. + + Returns the corresponding version as a string if it exists, otherwise + `None` is returned. 
+ m = re.match(r'^10\.(?P<min>\d+)(\.(?P<patch>\d+))?$', macos_version) + if not m: + raise Exception('Could not parse macOS version: "{}"'.format(macos_version)) + ver_min = int(m.group('min')) + ver_patch = m.group('patch') + if ver_patch: + ver_patch = int(ver_patch) + else: + ver_patch = 0 + result_str = '' + if apple_platform == 'osx': + # Drop patch for now. + result_str = '10.{}'.format(ver_min) + elif apple_platform.startswith('ios') or apple_platform.startswith('tvos'): + result_maj = ver_min - 2 + if result_maj < 1: + return None + result_str = '{}.{}'.format(result_maj, ver_patch) + elif apple_platform.startswith('watch'): + result_maj = ver_min - 9 + if result_maj < 1: + return None + result_str = '{}.{}'.format(result_maj, ver_patch) + else: + raise Exception('Unsupported apple platform "{}"'.format(apple_platform)) + return result_str + osx_version = (10, 0, 0) try: osx_version = subprocess.check_output(["sw_vers", "-productVersion"]) @@ -288,12 +326,17 @@ except: pass - config.substitutions.append( ("%macos_min_target_10_11", "-mmacosx-version-min=10.11") ) - - isIOS = config.apple_platform != "osx" + min_os_aligned_with_osx_10_11 = get_apple_platform_version_aligned_with('10.11', config.apple_platform) + min_os_aligned_with_osx_10_11_flag = '' + if min_os_aligned_with_osx_10_11: + min_os_aligned_with_osx_10_11_flag = '{flag}={version}'.format( + flag=config.apple_platform_min_deployment_target_flag, + version=min_os_aligned_with_osx_10_11) + else: + lit_config.warning('Could not find a version of {} that corresponds with macOS 10.11'.format(config.apple_platform)) + config.substitutions.append( ("%macos_min_target_10_11", min_os_aligned_with_osx_10_11_flag) ) # rdar://problem/22207160 - config.substitutions.append( ("%darwin_min_target_with_full_runtime_arc_support", - "-miphoneos-version-min=9.0" if isIOS else "-mmacosx-version-min=10.11") ) + config.substitutions.append( ("%darwin_min_target_with_full_runtime_arc_support",
min_os_aligned_with_osx_10_11_flag) ) # 32-bit iOS simulator is deprecated and removed in latest Xcode. if config.apple_platform == "iossim": diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in index 0fb51741783e1..4de8d030070f3 100644 --- a/compiler-rt/test/lit.common.configured.in +++ b/compiler-rt/test/lit.common.configured.in @@ -29,6 +29,7 @@ set_default("compiler_rt_libdir", "@COMPILER_RT_RESOLVED_LIBRARY_OUTPUT_DIR@") set_default("emulator", "@COMPILER_RT_EMULATOR@") set_default("asan_shadow_scale", "@COMPILER_RT_ASAN_SHADOW_SCALE@") set_default("apple_platform", "osx") +set_default("apple_platform_min_deployment_target_flag", "-mmacosx-version-min") set_default("sanitizer_can_use_cxxabi", @SANITIZER_CAN_USE_CXXABI_PYBOOL@) set_default("has_lld", @COMPILER_RT_HAS_LLD_PYBOOL@) set_default("can_symbolize", @CAN_SYMBOLIZE@) diff --git a/compiler-rt/test/tsan/CMakeLists.txt b/compiler-rt/test/tsan/CMakeLists.txt index 7cc3537660a63..67e20e5ecedb0 100644 --- a/compiler-rt/test/tsan/CMakeLists.txt +++ b/compiler-rt/test/tsan/CMakeLists.txt @@ -30,6 +30,7 @@ endif() foreach(arch ${TSAN_TEST_ARCH}) set(TSAN_TEST_APPLE_PLATFORM "osx") + set(TSAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_osx_MIN_VER_FLAG}") set(TSAN_TEST_TARGET_ARCH ${arch}) string(TOLOWER "-${arch}" TSAN_TEST_CONFIG_SUFFIX) @@ -77,6 +78,7 @@ if(APPLE) set(TSAN_TEST_CONFIG_SUFFIX "-${arch}-${platform}") set(TSAN_TEST_APPLE_PLATFORM "${platform}") set(TSAN_TEST_TARGET_ARCH "${arch}") + set(TSAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_${platform}_MIN_VER_FLAG}") configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/lit.site.cfg.py diff --git a/compiler-rt/test/tsan/lit.site.cfg.py.in b/compiler-rt/test/tsan/lit.site.cfg.py.in index 5e8d610e5df08..c6d453aaee26f 100644 --- a/compiler-rt/test/tsan/lit.site.cfg.py.in +++ b/compiler-rt/test/tsan/lit.site.cfg.py.in @@ -4,6 +4,7 @@ config.name_suffix = 
"@TSAN_TEST_CONFIG_SUFFIX@" config.tsan_lit_source_dir = "@TSAN_LIT_SOURCE_DIR@" config.has_libcxx = @TSAN_HAS_LIBCXX@ config.apple_platform = "@TSAN_TEST_APPLE_PLATFORM@" +config.apple_platform_min_deployment_target_flag = "@TSAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG@" config.target_cflags = "@TSAN_TEST_TARGET_CFLAGS@" config.target_arch = "@TSAN_TEST_TARGET_ARCH@" config.deflake_threshold = "@TSAN_TEST_DEFLAKE_THRESHOLD@" diff --git a/compiler-rt/test/ubsan/CMakeLists.txt b/compiler-rt/test/ubsan/CMakeLists.txt index 1ef554f0a88ae..f7ca0e5c04bb1 100644 --- a/compiler-rt/test/ubsan/CMakeLists.txt +++ b/compiler-rt/test/ubsan/CMakeLists.txt @@ -43,6 +43,10 @@ endif() foreach(arch ${UBSAN_TEST_ARCH}) set(UBSAN_TEST_TARGET_ARCH ${arch}) + if (APPLE) + set(UBSAN_TEST_APPLE_PLATFORM "osx") + set(UBSAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_osx_MIN_VER_FLAG}") + endif() get_test_cc_for_arch(${arch} UBSAN_TEST_TARGET_CC UBSAN_TEST_TARGET_CFLAGS) add_ubsan_testsuites("Standalone" ubsan ${arch}) @@ -73,8 +77,10 @@ macro(add_ubsan_device_testsuite test_mode sanitizer platform arch) set(UBSAN_TEST_USE_THINLTO "False") if (APPLE) set(UBSAN_TEST_APPLE_PLATFORM "${platform}") + set(UBSAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG "${DARWIN_${platform}_MIN_VER_FLAG}") else() unset(UBSAN_TEST_APPLE_PLATFORM) + unset(UBSAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG) endif() configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in diff --git a/compiler-rt/test/ubsan/lit.site.cfg.py.in b/compiler-rt/test/ubsan/lit.site.cfg.py.in index 6a29917b86ee4..4dfd5c5b5c1a9 100644 --- a/compiler-rt/test/ubsan/lit.site.cfg.py.in +++ b/compiler-rt/test/ubsan/lit.site.cfg.py.in @@ -8,6 +8,7 @@ config.target_arch = "@UBSAN_TEST_TARGET_ARCH@" config.use_lld = @UBSAN_TEST_USE_LLD@ config.use_thinlto = @UBSAN_TEST_USE_THINLTO@ config.apple_platform = "@UBSAN_TEST_APPLE_PLATFORM@" +config.apple_platform_min_deployment_target_flag = "@UBSAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG@" # Load common config for all 
compiler-rt lit tests. lit_config.load_config(config, "@COMPILER_RT_BINARY_DIR@/test/lit.common.configured") diff --git a/libcxx/include/cstddef b/libcxx/include/cstddef index bd62d6db39e8e..87eee4bf5b424 100644 --- a/libcxx/include/cstddef +++ b/libcxx/include/cstddef @@ -57,6 +57,32 @@ using ::max_align_t; typedef long double max_align_t; #endif +template struct __libcpp_is_integral { enum { value = 0 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +#ifndef _LIBCPP_NO_HAS_CHAR8_T +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +#endif +#ifndef _LIBCPP_HAS_NO_UNICODE_CHARS +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +#endif // _LIBCPP_HAS_NO_UNICODE_CHARS +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +template <> struct __libcpp_is_integral { enum { value = 1 }; }; +#ifndef _LIBCPP_HAS_NO_INT128 +template <> struct __libcpp_is_integral<__int128_t> { enum { value = 1 }; }; +template <> struct __libcpp_is_integral<__uint128_t> { enum { value = 1 }; }; +#endif + _LIBCPP_END_NAMESPACE_STD #if _LIBCPP_STD_VER > 14 @@ -64,6 +90,11 @@ namespace std // purposefully not versioned { enum class byte : unsigned char {}; + +template struct __enable_if_integral_imp {}; +template <> 
struct __enable_if_integral_imp { using type = byte; }; +template using _EnableByteOverload = typename __enable_if_integral_imp<__libcpp_is_integral<_Tp>::value>::type; + constexpr byte operator| (byte __lhs, byte __rhs) noexcept { return static_cast( @@ -104,10 +135,31 @@ constexpr byte operator~ (byte __b) noexcept ~static_cast(__b) )); } - +template + constexpr _EnableByteOverload<_Integer> & + operator<<=(byte& __lhs, _Integer __shift) noexcept + { return __lhs = __lhs << __shift; } + +template + constexpr _EnableByteOverload<_Integer> + operator<< (byte __lhs, _Integer __shift) noexcept + { return static_cast(static_cast(static_cast(__lhs) << __shift)); } + +template + constexpr _EnableByteOverload<_Integer> & + operator>>=(byte& __lhs, _Integer __shift) noexcept + { return __lhs = __lhs >> __shift; } + +template + constexpr _EnableByteOverload<_Integer> + operator>> (byte __lhs, _Integer __shift) noexcept + { return static_cast(static_cast(static_cast(__lhs) >> __shift)); } + +template > + constexpr _Integer + to_integer(byte __b) noexcept { return static_cast<_Integer>(__b); } } -#include // rest of byte #endif #endif // _LIBCPP_CSTDDEF diff --git a/libcxx/include/span b/libcxx/include/span index 82bcbff402b1e..1fe1496530e98 100644 --- a/libcxx/include/span +++ b/libcxx/include/span @@ -307,13 +307,13 @@ public: _LIBCPP_INLINE_VISIBILITY constexpr reference front() const noexcept { - static_assert(_Extent > 0, "span[].front() on empty span"); + _LIBCPP_ASSERT(!empty(), "span::front() on empty span"); return __data[0]; } _LIBCPP_INLINE_VISIBILITY constexpr reference back() const noexcept { - static_assert(_Extent > 0, "span[].back() on empty span"); + _LIBCPP_ASSERT(!empty(), "span::back() on empty span"); return __data[size()-1]; } diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits index f8ee5648d3581..6b8b855afc650 100644 --- a/libcxx/include/type_traits +++ b/libcxx/include/type_traits @@ -735,34 +735,8 @@ _LIBCPP_INLINE_VAR 
_LIBCPP_CONSTEXPR bool is_null_pointer_v // is_integral -template struct __libcpp_is_integral : public false_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -#ifndef _LIBCPP_NO_HAS_CHAR8_T -template <> struct __libcpp_is_integral : public true_type {}; -#endif -#ifndef _LIBCPP_HAS_NO_UNICODE_CHARS -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -#endif // _LIBCPP_HAS_NO_UNICODE_CHARS -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -template <> struct __libcpp_is_integral : public true_type {}; -#ifndef _LIBCPP_HAS_NO_INT128 -template <> struct __libcpp_is_integral<__int128_t> : public true_type {}; -template <> struct __libcpp_is_integral<__uint128_t> : public true_type {}; -#endif - template struct _LIBCPP_TEMPLATE_VIS is_integral - : public __libcpp_is_integral::type> {}; + : public _BoolConstant<__libcpp_is_integral::type>::value> {}; #if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) template @@ -4046,29 +4020,7 @@ _LIBCPP_END_NAMESPACE_STD // std::byte namespace std // purposefully not versioned { -template - constexpr typename enable_if, byte>::type & - operator<<=(byte& __lhs, _Integer __shift) noexcept - { return __lhs = __lhs << __shift; } - -template - constexpr typename enable_if, 
byte>::type - operator<< (byte __lhs, _Integer __shift) noexcept - { return static_cast(static_cast(static_cast(__lhs) << __shift)); } - -template - constexpr typename enable_if, byte>::type & - operator>>=(byte& __lhs, _Integer __shift) noexcept - { return __lhs = __lhs >> __shift; } - -template - constexpr typename enable_if, byte>::type - operator>> (byte __lhs, _Integer __shift) noexcept - { return static_cast(static_cast(static_cast(__lhs) >> __shift)); } - -template - constexpr typename enable_if, _Integer>::type - to_integer(byte __b) noexcept { return static_cast<_Integer>(__b); } + } #endif diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo index 27601769a83be..74813cc5016df 100644 --- a/libcxx/include/typeinfo +++ b/libcxx/include/typeinfo @@ -60,6 +60,7 @@ public: #include #include #include +#include #ifdef _LIBCPP_NO_EXCEPTIONS #include #endif diff --git a/libcxx/test/std/containers/views/span.elem/back.pass.cpp b/libcxx/test/std/containers/views/span.elem/back.pass.cpp index f2c0cf60dbe80..5bb9631aa90b1 100644 --- a/libcxx/test/std/containers/views/span.elem/back.pass.cpp +++ b/libcxx/test/std/containers/views/span.elem/back.pass.cpp @@ -30,7 +30,6 @@ constexpr bool testConstexprSpan(Span sp) return std::addressof(sp.back()) == sp.data() + sp.size() - 1; } - template void testRuntimeSpan(Span sp) { @@ -38,6 +37,12 @@ void testRuntimeSpan(Span sp) assert(std::addressof(sp.back()) == sp.data() + sp.size() - 1); } +template +void testEmptySpan(Span sp) +{ + if (!sp.empty()) + [[maybe_unused]] auto res = sp.back(); +} struct A{}; constexpr int iArr1[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; @@ -71,5 +76,8 @@ int main(int, char**) testRuntimeSpan(std::span (&s, 1)); testRuntimeSpan(std::span(&s, 1)); + std::span sp; + testEmptySpan(sp); + return 0; } diff --git a/libcxx/test/std/containers/views/span.elem/front.pass.cpp b/libcxx/test/std/containers/views/span.elem/front.pass.cpp index 7f18a2422b395..e17f7dd1576dd 100644 --- 
a/libcxx/test/std/containers/views/span.elem/front.pass.cpp +++ b/libcxx/test/std/containers/views/span.elem/front.pass.cpp @@ -38,6 +38,12 @@ void testRuntimeSpan(Span sp) assert(std::addressof(sp.front()) == sp.data()); } +template +void testEmptySpan(Span sp) +{ + if (!sp.empty()) + [[maybe_unused]] auto res = sp.front(); +} struct A{}; constexpr int iArr1[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; @@ -71,5 +77,8 @@ int main(int, char**) testRuntimeSpan(std::span (&s, 1)); testRuntimeSpan(std::span(&s, 1)); + std::span sp; + testEmptySpan(sp); + return 0; } diff --git a/libcxx/test/std/language.support/support.types/byteops/to_integer.pass.cpp b/libcxx/test/std/language.support/support.types/byteops/to_integer.pass.cpp index 657d17d9c4516..ef1779e1b45fe 100644 --- a/libcxx/test/std/language.support/support.types/byteops/to_integer.pass.cpp +++ b/libcxx/test/std/language.support/support.types/byteops/to_integer.pass.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include +#include #include // UNSUPPORTED: c++98, c++03, c++11, c++14 diff --git a/libcxx/test/std/numerics/c.math/abs.pass.cpp b/libcxx/test/std/numerics/c.math/abs.pass.cpp index 3993dd34318a7..03aae465c2573 100644 --- a/libcxx/test/std/numerics/c.math/abs.pass.cpp +++ b/libcxx/test/std/numerics/c.math/abs.pass.cpp @@ -47,7 +47,7 @@ int main(int, char**) { // On some systems char is unsigned. // If that is the case, we should just test signed char twice. - typedef typename std::conditional< + typedef std::conditional< std::is_signed::value, char, signed char >::type SignedChar; @@ -63,10 +63,10 @@ int main(int, char**) // Here there is no guarantee that int is larger than int8_t so we // use a helper type trait to conditional test against int. 
- test_abs::type>(); - test_abs::type>(); - test_abs::type>(); - test_abs::type>(); + test_abs::type>(); + test_abs::type>(); + test_abs::type>(); + test_abs::type>(); test_abs(); test_abs(); diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index bd01bc02617c8..3a0251c90692f 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -2747,8 +2747,8 @@ createSymbols(ArrayRef> nameAttrs, size_t numShards = 32; size_t concurrency = 1; if (threadsEnabled) - concurrency = - std::min(PowerOf2Floor(hardware_concurrency()), numShards); + concurrency = std::min( + hardware_concurrency().compute_thread_count(), numShards); // A sharded map to uniquify symbols by name. std::vector> map(numShards); @@ -3191,8 +3191,8 @@ void MergeNoTailSection::finalizeContents() { // operations in the following tight loop. size_t concurrency = 1; if (threadsEnabled) - concurrency = - std::min(PowerOf2Floor(hardware_concurrency()), numShards); + concurrency = std::min( + hardware_concurrency().compute_thread_count(), numShards); // Add section pieces to the builders. 
parallelForEachN(0, concurrency, [&](size_t threadId) { diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h index af4507bf3496b..1544ecc386c7b 100644 --- a/lldb/include/lldb/Core/PluginManager.h +++ b/lldb/include/lldb/Core/PluginManager.h @@ -22,14 +22,14 @@ #include #include -#define LLDB_PLUGIN(PluginName) \ +#define LLDB_PLUGIN_DEFINE(PluginName) \ namespace lldb_private { \ void lldb_initialize_##PluginName() { PluginName::Initialize(); } \ void lldb_terminate_##PluginName() { PluginName::Terminate(); } \ } // FIXME: Generate me with CMake -#define LLDB_PLUGIN_DECLARE(PluginName) \ +#define LLDB_PLUGIN_DECLARE(PluginName) \ namespace lldb_private { \ extern void lldb_initialize_##PluginName(); \ extern void lldb_terminate_##PluginName(); \ diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp index 6d82a9826b393..f73255d322409 100644 --- a/lldb/source/Host/common/Editline.cpp +++ b/lldb/source/Host/common/Editline.cpp @@ -99,18 +99,24 @@ bool IsOnlySpaces(const EditLineStringType &content) { static int GetOperation(HistoryOperation op) { // The naming used by editline for the history operations is counter - // intuitive to how it's used here. + // intuitive to how it's used in LLDB's editline implementation. + // + // - The H_LAST returns the oldest entry in the history. // // - The H_PREV operation returns the previous element in the history, which // is newer than the current one. // + // - The H_CURR returns the current entry in the history. + // // - The H_NEXT operation returns the next element in the history, which is // older than the current one. // + // - The H_FIRST returns the most recent entry in the history. + // // The naming of the enum entries match the semantic meaning. 
switch(op) { case HistoryOperation::Oldest: - return H_FIRST; + return H_LAST; case HistoryOperation::Older: return H_NEXT; case HistoryOperation::Current: @@ -118,7 +124,7 @@ static int GetOperation(HistoryOperation op) { case HistoryOperation::Newer: return H_PREV; case HistoryOperation::Newest: - return H_LAST; + return H_FIRST; } llvm_unreachable("Fully covered switch!"); } diff --git a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp index f37bc1d235897..43cc4c3cd87bf 100644 --- a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp +++ b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp @@ -11,7 +11,7 @@ #include "ABISysV_arm64.h" #include "lldb/Core/PluginManager.h" -LLDB_PLUGIN(ABIAArch64) +LLDB_PLUGIN_DEFINE(ABIAArch64) void ABIAArch64::Initialize() { ABISysV_arm64::Initialize(); diff --git a/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp b/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp index 7726c1b891de0..1690f1c511f2a 100644 --- a/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp +++ b/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp @@ -55,7 +55,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_arc) +LLDB_PLUGIN_DEFINE(ABISysV_arc) namespace { namespace dwarf { diff --git a/lldb/source/Plugins/ABI/ARM/ABIARM.cpp b/lldb/source/Plugins/ABI/ARM/ABIARM.cpp index 790cb877b91e2..882c14d386e31 100644 --- a/lldb/source/Plugins/ABI/ARM/ABIARM.cpp +++ b/lldb/source/Plugins/ABI/ARM/ABIARM.cpp @@ -11,7 +11,7 @@ #include "ABISysV_arm.h" #include "lldb/Core/PluginManager.h" -LLDB_PLUGIN(ABIARM) +LLDB_PLUGIN_DEFINE(ABIARM) void ABIARM::Initialize() { ABISysV_arm::Initialize(); diff --git a/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp b/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp index 4d38b9165728f..73d8308ae0dc4 100644 --- a/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp +++ b/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp @@ -34,7 +34,7 @@ using namespace lldb; using namespace lldb_private; 
-LLDB_PLUGIN(ABIMacOSX_arm) +LLDB_PLUGIN_DEFINE(ABIMacOSX_arm) static RegisterInfo g_register_infos[] = { // NAME ALT SZ OFF ENCODING FORMAT EH_FRAME diff --git a/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp b/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp index 8d7867827f602..1a93bac564f72 100644 --- a/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp +++ b/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp @@ -34,7 +34,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_arm) +LLDB_PLUGIN_DEFINE(ABISysV_arm) static RegisterInfo g_register_infos[] = { // NAME ALT SZ OFF ENCODING FORMAT EH_FRAME diff --git a/lldb/source/Plugins/ABI/Hexagon/ABISysV_hexagon.cpp b/lldb/source/Plugins/ABI/Hexagon/ABISysV_hexagon.cpp index 65407bfe2543a..601d9c2f0f052 100644 --- a/lldb/source/Plugins/ABI/Hexagon/ABISysV_hexagon.cpp +++ b/lldb/source/Plugins/ABI/Hexagon/ABISysV_hexagon.cpp @@ -32,7 +32,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_hexagon) +LLDB_PLUGIN_DEFINE(ABISysV_hexagon) static RegisterInfo g_register_infos[] = { // hexagon-core.xml diff --git a/lldb/source/Plugins/ABI/Mips/ABIMips.cpp b/lldb/source/Plugins/ABI/Mips/ABIMips.cpp index 08e694a659b41..16ef1faf9d9d6 100644 --- a/lldb/source/Plugins/ABI/Mips/ABIMips.cpp +++ b/lldb/source/Plugins/ABI/Mips/ABIMips.cpp @@ -11,7 +11,7 @@ #include "ABISysV_mips64.h" #include "lldb/Core/PluginManager.h" -LLDB_PLUGIN(ABIMips) +LLDB_PLUGIN_DEFINE(ABIMips) void ABIMips::Initialize() { ABISysV_mips::Initialize(); diff --git a/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp b/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp index 401646a334f1d..d66e0926ad99e 100644 --- a/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp +++ b/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp @@ -32,7 +32,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_mips) +LLDB_PLUGIN_DEFINE(ABISysV_mips) enum dwarf_regnums { dwarf_r0 = 0, diff --git a/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp 
b/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp index ea42f0c8fe17c..bb28a50e5f4ab 100644 --- a/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp +++ b/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp @@ -32,7 +32,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_mips64) +LLDB_PLUGIN_DEFINE(ABISysV_mips64) enum dwarf_regnums { dwarf_r0 = 0, diff --git a/lldb/source/Plugins/ABI/PowerPC/ABIPowerPC.cpp b/lldb/source/Plugins/ABI/PowerPC/ABIPowerPC.cpp index b1591dba6a1bb..b561e3c93f571 100644 --- a/lldb/source/Plugins/ABI/PowerPC/ABIPowerPC.cpp +++ b/lldb/source/Plugins/ABI/PowerPC/ABIPowerPC.cpp @@ -11,7 +11,7 @@ #include "ABISysV_ppc64.h" #include "lldb/Core/PluginManager.h" -LLDB_PLUGIN(ABIPowerPC) +LLDB_PLUGIN_DEFINE(ABIPowerPC) void ABIPowerPC::Initialize() { ABISysV_ppc::Initialize(); diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp index c4d90a69a0320..6f5eded7b0315 100644 --- a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp +++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp @@ -32,7 +32,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_ppc) +LLDB_PLUGIN_DEFINE(ABISysV_ppc) enum dwarf_regnums { dwarf_r0 = 0, diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp index dba347d3ceafc..251ac972fd768 100644 --- a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp +++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp @@ -47,7 +47,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_ppc64) +LLDB_PLUGIN_DEFINE(ABISysV_ppc64) const lldb_private::RegisterInfo * ABISysV_ppc64::GetRegisterInfoArray(uint32_t &count) { diff --git a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp index c01e088cd7996..bfeaa1226df26 100644 --- a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp +++ 
b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp @@ -32,7 +32,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_s390x) +LLDB_PLUGIN_DEFINE(ABISysV_s390x) enum dwarf_regnums { // General Purpose Registers diff --git a/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp b/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp index 610baa2ca0869..d11c1af1d2599 100644 --- a/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp +++ b/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp @@ -29,7 +29,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABIMacOSX_i386) +LLDB_PLUGIN_DEFINE(ABIMacOSX_i386) enum { ehframe_eax = 0, diff --git a/lldb/source/Plugins/ABI/X86/ABISysV_i386.cpp b/lldb/source/Plugins/ABI/X86/ABISysV_i386.cpp index a89e0baad1a2d..8fc22b21623cd 100644 --- a/lldb/source/Plugins/ABI/X86/ABISysV_i386.cpp +++ b/lldb/source/Plugins/ABI/X86/ABISysV_i386.cpp @@ -31,7 +31,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_i386) +LLDB_PLUGIN_DEFINE(ABISysV_i386) // This source file uses the following document as a reference: //==================================================================== diff --git a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp index 571b796652eec..01671190e106f 100644 --- a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp +++ b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp @@ -35,7 +35,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABISysV_x86_64) +LLDB_PLUGIN_DEFINE(ABISysV_x86_64) enum dwarf_regnums { dwarf_rax = 0, diff --git a/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp b/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp index 6a7c98323037f..37b1aedcd463c 100644 --- a/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp +++ b/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp @@ -33,7 +33,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ABIWindows_x86_64) 
+LLDB_PLUGIN_DEFINE(ABIWindows_x86_64) enum dwarf_regnums { dwarf_rax = 0, diff --git a/lldb/source/Plugins/ABI/X86/ABIX86.cpp b/lldb/source/Plugins/ABI/X86/ABIX86.cpp index 207d0b289d67f..714bf25f482ce 100644 --- a/lldb/source/Plugins/ABI/X86/ABIX86.cpp +++ b/lldb/source/Plugins/ABI/X86/ABIX86.cpp @@ -13,7 +13,7 @@ #include "ABIWindows_x86_64.h" #include "lldb/Core/PluginManager.h" -LLDB_PLUGIN(ABIX86) +LLDB_PLUGIN_DEFINE(ABIX86) void ABIX86::Initialize() { ABIMacOSX_i386::Initialize(); diff --git a/lldb/source/Plugins/Architecture/Arm/ArchitectureArm.cpp b/lldb/source/Plugins/Architecture/Arm/ArchitectureArm.cpp index 7fb9281fb7875..58c7cbb4530ad 100644 --- a/lldb/source/Plugins/Architecture/Arm/ArchitectureArm.cpp +++ b/lldb/source/Plugins/Architecture/Arm/ArchitectureArm.cpp @@ -17,7 +17,7 @@ using namespace lldb_private; using namespace lldb; -LLDB_PLUGIN(ArchitectureArm) +LLDB_PLUGIN_DEFINE(ArchitectureArm) ConstString ArchitectureArm::GetPluginNameStatic() { return ConstString("arm"); diff --git a/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp b/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp index e8240ce0d725b..f426ac63e4b53 100644 --- a/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp +++ b/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp @@ -21,7 +21,7 @@ using namespace lldb_private; using namespace lldb; -LLDB_PLUGIN(ArchitectureMips) +LLDB_PLUGIN_DEFINE(ArchitectureMips) ConstString ArchitectureMips::GetPluginNameStatic() { return ConstString("mips"); diff --git a/lldb/source/Plugins/Architecture/PPC64/ArchitecturePPC64.cpp b/lldb/source/Plugins/Architecture/PPC64/ArchitecturePPC64.cpp index 83d6832381e73..94301ecf052c1 100644 --- a/lldb/source/Plugins/Architecture/PPC64/ArchitecturePPC64.cpp +++ b/lldb/source/Plugins/Architecture/PPC64/ArchitecturePPC64.cpp @@ -20,7 +20,7 @@ using namespace lldb_private; using namespace lldb; -LLDB_PLUGIN(ArchitecturePPC64) +LLDB_PLUGIN_DEFINE(ArchitecturePPC64) ConstString 
ArchitecturePPC64::GetPluginNameStatic() { return ConstString("ppc64"); diff --git a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp index 139bda59a60c0..6427d8d176c86 100644 --- a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp +++ b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp @@ -43,7 +43,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(DisassemblerLLVMC) +LLDB_PLUGIN_DEFINE(DisassemblerLLVMC) class DisassemblerLLVMC::MCDisasmInstance { public: diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp index 26f85906e3173..193b3bd829c54 100644 --- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp +++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp @@ -44,7 +44,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(DynamicLoaderDarwinKernel) +LLDB_PLUGIN_DEFINE(DynamicLoaderDarwinKernel) // Progressively greater amounts of scanning we will allow For some targets // very early in startup, we can't do any random reads of memory or we can diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp index 9cb6d1fcb612d..a6db648baa1af 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp @@ -47,7 +47,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(DynamicLoaderMacOSXDYLD) +LLDB_PLUGIN_DEFINE(DynamicLoaderMacOSXDYLD) // Create an instance of this class. 
This function is filled into the plugin // info class that gets handed out by the plugin factory and allows the lldb to diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp index eed8a487d258b..c572c3024f9ce 100644 --- a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp @@ -29,7 +29,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(DynamicLoaderPOSIXDYLD) +LLDB_PLUGIN_DEFINE(DynamicLoaderPOSIXDYLD) void DynamicLoaderPOSIXDYLD::Initialize() { PluginManager::RegisterPlugin(GetPluginNameStatic(), diff --git a/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp b/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp index 651d233cd025b..13aad5f4ccb66 100644 --- a/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp +++ b/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp @@ -17,7 +17,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(DynamicLoaderStatic) +LLDB_PLUGIN_DEFINE(DynamicLoaderStatic) // Create an instance of this class. 
This function is filled into the plugin // info class that gets handed out by the plugin factory and allows the lldb to diff --git a/lldb/source/Plugins/DynamicLoader/Windows-DYLD/DynamicLoaderWindowsDYLD.cpp b/lldb/source/Plugins/DynamicLoader/Windows-DYLD/DynamicLoaderWindowsDYLD.cpp index 442eae7d8d09a..e4eceb2bd63c8 100644 --- a/lldb/source/Plugins/DynamicLoader/Windows-DYLD/DynamicLoaderWindowsDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/Windows-DYLD/DynamicLoaderWindowsDYLD.cpp @@ -23,7 +23,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(DynamicLoaderWindowsDYLD) +LLDB_PLUGIN_DEFINE(DynamicLoaderWindowsDYLD) DynamicLoaderWindowsDYLD::DynamicLoaderWindowsDYLD(Process *process) : DynamicLoader(process) {} diff --git a/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp b/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp index 62d69953fe765..e87bc1f75f5ce 100644 --- a/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp +++ b/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp @@ -30,7 +30,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(EmulateInstructionARM) +LLDB_PLUGIN_DEFINE(EmulateInstructionARM) // Convenient macro definitions. 
#define APSR_C Bit32(m_opcode_cpsr, CPSR_C_POS) diff --git a/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.cpp b/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.cpp index 01cd03de60ab9..144d383732470 100644 --- a/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.cpp +++ b/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.cpp @@ -47,7 +47,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(EmulateInstructionARM64) +LLDB_PLUGIN_DEFINE(EmulateInstructionARM64) static bool LLDBTableGetRegisterInfo(uint32_t reg_num, RegisterInfo ®_info) { if (reg_num >= llvm::array_lengthof(g_register_infos_arm64_le)) diff --git a/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.cpp b/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.cpp index f4a947599f518..ae74c89c4f2eb 100644 --- a/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.cpp +++ b/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.cpp @@ -40,7 +40,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(EmulateInstructionMIPS) +LLDB_PLUGIN_DEFINE(EmulateInstructionMIPS) #define UInt(x) ((uint64_t)x) #define integer int64_t diff --git a/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.cpp b/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.cpp index 711e6d594eb63..9a578ab408f74 100644 --- a/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.cpp +++ b/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.cpp @@ -40,7 +40,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(EmulateInstructionMIPS64) +LLDB_PLUGIN_DEFINE(EmulateInstructionMIPS64) #define UInt(x) ((uint64_t)x) #define integer int64_t diff --git a/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp b/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp index 52175ef5f4ae7..2588c935dd6b7 100644 --- 
a/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp +++ b/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp @@ -25,7 +25,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(EmulateInstructionPPC64) +LLDB_PLUGIN_DEFINE(EmulateInstructionPPC64) EmulateInstructionPPC64::EmulateInstructionPPC64(const ArchSpec &arch) : EmulateInstruction(arch) {} diff --git a/lldb/source/Plugins/InstrumentationRuntime/ASan/InstrumentationRuntimeASan.cpp b/lldb/source/Plugins/InstrumentationRuntime/ASan/InstrumentationRuntimeASan.cpp index dc23b604722d6..e78ea3a684836 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/ASan/InstrumentationRuntimeASan.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/ASan/InstrumentationRuntimeASan.cpp @@ -30,7 +30,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(InstrumentationRuntimeASan) +LLDB_PLUGIN_DEFINE(InstrumentationRuntimeASan) lldb::InstrumentationRuntimeSP InstrumentationRuntimeASan::CreateInstance(const lldb::ProcessSP &process_sp) { diff --git a/lldb/source/Plugins/InstrumentationRuntime/MainThreadChecker/InstrumentationRuntimeMainThreadChecker.cpp b/lldb/source/Plugins/InstrumentationRuntime/MainThreadChecker/InstrumentationRuntimeMainThreadChecker.cpp index 91c411a4f013a..72d28c3474576 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/MainThreadChecker/InstrumentationRuntimeMainThreadChecker.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/MainThreadChecker/InstrumentationRuntimeMainThreadChecker.cpp @@ -29,7 +29,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(InstrumentationRuntimeMainThreadChecker) +LLDB_PLUGIN_DEFINE(InstrumentationRuntimeMainThreadChecker) InstrumentationRuntimeMainThreadChecker:: ~InstrumentationRuntimeMainThreadChecker() { diff --git a/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp b/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp index 
4229626077345..f4c116e7576c4 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp @@ -35,7 +35,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(InstrumentationRuntimeTSan) +LLDB_PLUGIN_DEFINE(InstrumentationRuntimeTSan) lldb::InstrumentationRuntimeSP InstrumentationRuntimeTSan::CreateInstance(const lldb::ProcessSP &process_sp) { diff --git a/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp b/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp index b13eac6081462..b60eb53f3d4a7 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp @@ -36,7 +36,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(InstrumentationRuntimeUBSan) +LLDB_PLUGIN_DEFINE(InstrumentationRuntimeUBSan) InstrumentationRuntimeUBSan::~InstrumentationRuntimeUBSan() { Deactivate(); } diff --git a/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp b/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp index ad089ad0d2295..df9f700a7f185 100644 --- a/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp +++ b/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp @@ -32,7 +32,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(JITLoaderGDB) +LLDB_PLUGIN_DEFINE(JITLoaderGDB) // Debug Interface Structures enum jit_actions_t { JIT_NOACTION = 0, JIT_REGISTER_FN, JIT_UNREGISTER_FN }; diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index ea2c0104cf2a1..97084da5fffad 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -43,7 +43,7 @@ using namespace lldb; using namespace 
lldb_private; using namespace lldb_private::formatters; -LLDB_PLUGIN(CPlusPlusLanguage) +LLDB_PLUGIN_DEFINE(CPlusPlusLanguage) void CPlusPlusLanguage::Initialize() { PluginManager::RegisterPlugin(GetPluginNameStatic(), "C++ Language", diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp index 82fe9b39b81f2..6b2a5f845d734 100644 --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp +++ b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp @@ -37,7 +37,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::formatters; -LLDB_PLUGIN(ObjCLanguage) +LLDB_PLUGIN_DEFINE(ObjCLanguage) void ObjCLanguage::Initialize() { PluginManager::RegisterPlugin(GetPluginNameStatic(), "Objective-C Language", diff --git a/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp index 207cec1a01f91..0a4017eda434c 100644 --- a/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp @@ -14,7 +14,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ObjCPlusPlusLanguage) +LLDB_PLUGIN_DEFINE(ObjCPlusPlusLanguage) bool ObjCPlusPlusLanguage::IsSourceFile(llvm::StringRef file_path) const { const auto suffixes = {".h", ".mm"}; diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp index 35418e0c2ffdc..e08f0f070f6c0 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp @@ -40,7 +40,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ItaniumABILanguageRuntime) +LLDB_PLUGIN_DEFINE(ItaniumABILanguageRuntime) static const char 
*vtable_demangled_prefix = "vtable for "; diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntime.cpp index 3d58f41235eaa..cca6911485a04 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntime.cpp @@ -44,7 +44,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(AppleObjCRuntime) +LLDB_PLUGIN_DEFINE(AppleObjCRuntime) char AppleObjCRuntime::ID = 0; diff --git a/lldb/source/Plugins/LanguageRuntime/RenderScript/RenderScriptRuntime/RenderScriptRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/RenderScript/RenderScriptRuntime/RenderScriptRuntime.cpp index 9b81ba03148cb..f2b95028f807a 100644 --- a/lldb/source/Plugins/LanguageRuntime/RenderScript/RenderScriptRuntime/RenderScriptRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/RenderScript/RenderScriptRuntime/RenderScriptRuntime.cpp @@ -46,7 +46,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_renderscript; -LLDB_PLUGIN(RenderScriptRuntime) +LLDB_PLUGIN_DEFINE(RenderScriptRuntime) #define FMT_COORD "(%" PRIu32 ", %" PRIu32 ", %" PRIu32 ")" diff --git a/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp b/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp index 0c8250c5de895..4b9da8f76fd24 100644 --- a/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp +++ b/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp @@ -28,7 +28,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(MemoryHistoryASan) +LLDB_PLUGIN_DEFINE(MemoryHistoryASan) MemoryHistorySP MemoryHistoryASan::CreateInstance(const ProcessSP &process_sp) { if (!process_sp.get()) diff --git a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp 
b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp index 3d4885379e86d..47c7ae8c8d639 100644 --- a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp +++ b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp @@ -40,7 +40,7 @@ typedef struct ar_hdr { using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ObjectContainerBSDArchive) +LLDB_PLUGIN_DEFINE(ObjectContainerBSDArchive) ObjectContainerBSDArchive::Object::Object() : ar_name(), modification_time(0), uid(0), gid(0), mode(0), size(0), diff --git a/lldb/source/Plugins/ObjectContainer/Universal-Mach-O/ObjectContainerUniversalMachO.cpp b/lldb/source/Plugins/ObjectContainer/Universal-Mach-O/ObjectContainerUniversalMachO.cpp index ef763addede4f..bc30e57d1d0cd 100644 --- a/lldb/source/Plugins/ObjectContainer/Universal-Mach-O/ObjectContainerUniversalMachO.cpp +++ b/lldb/source/Plugins/ObjectContainer/Universal-Mach-O/ObjectContainerUniversalMachO.cpp @@ -20,7 +20,7 @@ using namespace lldb; using namespace lldb_private; using namespace llvm::MachO; -LLDB_PLUGIN(ObjectContainerUniversalMachO) +LLDB_PLUGIN_DEFINE(ObjectContainerUniversalMachO) void ObjectContainerUniversalMachO::Initialize() { PluginManager::RegisterPlugin(GetPluginNameStatic(), diff --git a/lldb/source/Plugins/ObjectFile/Breakpad/ObjectFileBreakpad.cpp b/lldb/source/Plugins/ObjectFile/Breakpad/ObjectFileBreakpad.cpp index f36305be5960e..7a9163ddb8801 100644 --- a/lldb/source/Plugins/ObjectFile/Breakpad/ObjectFileBreakpad.cpp +++ b/lldb/source/Plugins/ObjectFile/Breakpad/ObjectFileBreakpad.cpp @@ -16,7 +16,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::breakpad; -LLDB_PLUGIN(ObjectFileBreakpad) +LLDB_PLUGIN_DEFINE(ObjectFileBreakpad) namespace { struct Header { diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index b97a326fde3ed..a328e16e4bde5 100644 --- 
a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -51,7 +51,7 @@ using namespace lldb_private; using namespace elf; using namespace llvm::ELF; -LLDB_PLUGIN(ObjectFileELF) +LLDB_PLUGIN_DEFINE(ObjectFileELF) namespace { diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index ce7d293f205d7..afa9b645cbecc 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -66,7 +66,7 @@ using namespace lldb; using namespace lldb_private; using namespace llvm::MachO; -LLDB_PLUGIN(ObjectFileMachO) +LLDB_PLUGIN_DEFINE(ObjectFileMachO) // Some structure definitions needed for parsing the dyld shared cache files // found on iOS devices. diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp index ec11fdd0e3750..38b4472f50a75 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp @@ -41,7 +41,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ObjectFilePECOFF) +LLDB_PLUGIN_DEFINE(ObjectFilePECOFF) struct CVInfoPdb70 { // 16-byte GUID diff --git a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp index 270626061737f..b9561bdff9f3e 100644 --- a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp +++ b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp @@ -28,7 +28,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::wasm; -LLDB_PLUGIN(ObjectFileWasm) +LLDB_PLUGIN_DEFINE(ObjectFileWasm) static const uint32_t kWasmHeaderSize = sizeof(llvm::wasm::WasmMagic) + sizeof(llvm::wasm::WasmVersion); diff --git a/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp 
b/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp index da65e9f54a73f..417aa2e21436f 100644 --- a/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp +++ b/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp @@ -39,7 +39,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(OperatingSystemPython) +LLDB_PLUGIN_DEFINE(OperatingSystemPython) void OperatingSystemPython::Initialize() { PluginManager::RegisterPlugin(GetPluginNameStatic(), diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp index 6ce0858787a1e..9949fbf18fa33 100644 --- a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp +++ b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp @@ -26,7 +26,7 @@ using namespace lldb_private; using namespace lldb_private::platform_android; using namespace std::chrono; -LLDB_PLUGIN(PlatformAndroid) +LLDB_PLUGIN_DEFINE(PlatformAndroid) static uint32_t g_initialize_count = 0; static const unsigned int g_android_default_cache_size = diff --git a/lldb/source/Plugins/Platform/FreeBSD/PlatformFreeBSD.cpp b/lldb/source/Plugins/Platform/FreeBSD/PlatformFreeBSD.cpp index e7a3ea2e6ec7c..97c2f22b505f5 100644 --- a/lldb/source/Plugins/Platform/FreeBSD/PlatformFreeBSD.cpp +++ b/lldb/source/Plugins/Platform/FreeBSD/PlatformFreeBSD.cpp @@ -36,7 +36,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::platform_freebsd; -LLDB_PLUGIN(PlatformFreeBSD) +LLDB_PLUGIN_DEFINE(PlatformFreeBSD) static uint32_t g_initialize_count = 0; diff --git a/lldb/source/Plugins/Platform/Linux/PlatformLinux.cpp b/lldb/source/Plugins/Platform/Linux/PlatformLinux.cpp index 66c3529253817..cea87c4d90ad1 100644 --- a/lldb/source/Plugins/Platform/Linux/PlatformLinux.cpp +++ b/lldb/source/Plugins/Platform/Linux/PlatformLinux.cpp @@ -34,7 +34,7 @@ using namespace lldb; using namespace lldb_private; using namespace 
lldb_private::platform_linux; -LLDB_PLUGIN(PlatformLinux) +LLDB_PLUGIN_DEFINE(PlatformLinux) static uint32_t g_initialize_count = 0; diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp index c62940f35e5c7..38de91a30cf65 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp @@ -39,7 +39,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(PlatformMacOSX) +LLDB_PLUGIN_DEFINE(PlatformMacOSX) static uint32_t g_initialize_count = 0; diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteiOS.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteiOS.cpp index 567c64ca5519f..b37cdecd38c4d 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteiOS.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteiOS.cpp @@ -25,7 +25,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(PlatformRemoteiOS) +LLDB_PLUGIN_DEFINE(PlatformRemoteiOS) // Static Variables static uint32_t g_initialize_count = 0; diff --git a/lldb/source/Plugins/Platform/NetBSD/PlatformNetBSD.cpp b/lldb/source/Plugins/Platform/NetBSD/PlatformNetBSD.cpp index 1c7980151f61b..9942c339650ee 100644 --- a/lldb/source/Plugins/Platform/NetBSD/PlatformNetBSD.cpp +++ b/lldb/source/Plugins/Platform/NetBSD/PlatformNetBSD.cpp @@ -34,7 +34,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::platform_netbsd; -LLDB_PLUGIN(PlatformNetBSD) +LLDB_PLUGIN_DEFINE(PlatformNetBSD) static uint32_t g_initialize_count = 0; diff --git a/lldb/source/Plugins/Platform/OpenBSD/PlatformOpenBSD.cpp b/lldb/source/Plugins/Platform/OpenBSD/PlatformOpenBSD.cpp index 36f5e1692db28..a743970990a64 100644 --- a/lldb/source/Plugins/Platform/OpenBSD/PlatformOpenBSD.cpp +++ b/lldb/source/Plugins/Platform/OpenBSD/PlatformOpenBSD.cpp @@ -34,7 +34,7 @@ using namespace lldb; using namespace lldb_private; using namespace 
lldb_private::platform_openbsd; -LLDB_PLUGIN(PlatformOpenBSD) +LLDB_PLUGIN_DEFINE(PlatformOpenBSD) static uint32_t g_initialize_count = 0; diff --git a/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp b/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp index 01250b34ddc98..7983c1d461b6c 100644 --- a/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp +++ b/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp @@ -27,7 +27,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(PlatformWindows) +LLDB_PLUGIN_DEFINE(PlatformWindows) static uint32_t g_initialize_count = 0; diff --git a/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp b/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp index b581c0783cf85..550b68090e7a8 100644 --- a/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp +++ b/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp @@ -35,7 +35,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::platform_gdb_server; -LLDB_PLUGIN(PlatformRemoteGDBServer) +LLDB_PLUGIN_DEFINE(PlatformRemoteGDBServer) static bool g_initialized = false; diff --git a/lldb/source/Plugins/Process/FreeBSD/ProcessFreeBSD.cpp b/lldb/source/Plugins/Process/FreeBSD/ProcessFreeBSD.cpp index a11959aa5a2f6..7228ec987ad1e 100644 --- a/lldb/source/Plugins/Process/FreeBSD/ProcessFreeBSD.cpp +++ b/lldb/source/Plugins/Process/FreeBSD/ProcessFreeBSD.cpp @@ -56,7 +56,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ProcessFreeBSD) +LLDB_PLUGIN_DEFINE(ProcessFreeBSD) namespace { UnixSignalsSP &GetFreeBSDSignals() { diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index c0f31d76d018c..e78912e3cd305 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -50,7 +50,7 @@ using namespace 
lldb; using namespace lldb_private; -LLDB_PLUGIN(ProcessKDP) +LLDB_PLUGIN_DEFINE(ProcessKDP) namespace { diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp index a35e6c08c63da..286a95fa58947 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp @@ -44,7 +44,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ProcessWindows) +LLDB_PLUGIN_DEFINE(ProcessWindows) namespace { std::string GetProcessExecutableName(HANDLE process_handle) { diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp index 4056800c9de3f..aa95e92607ad7 100644 --- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp +++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp @@ -35,7 +35,7 @@ using namespace lldb_private; namespace ELF = llvm::ELF; -LLDB_PLUGIN(ProcessElfCore) +LLDB_PLUGIN_DEFINE(ProcessElfCore) ConstString ProcessElfCore::GetPluginNameStatic() { static ConstString g_name("elf-core"); diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 3296560658229..c7fc0161d53aa 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -90,7 +90,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::process_gdb_remote; -LLDB_PLUGIN(ProcessGDBRemote) +LLDB_PLUGIN_DEFINE(ProcessGDBRemote) namespace lldb { // Provide a function that can easily dump the packet history if we know a diff --git a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp index 3e6ba49bf0dd8..1628dc545c9b7 100644 --- a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp +++ 
b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp @@ -44,7 +44,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ProcessMachCore) +LLDB_PLUGIN_DEFINE(ProcessMachCore) ConstString ProcessMachCore::GetPluginNameStatic() { static ConstString g_name("mach-o-core"); diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp index dcbeb3bf81716..0ce3b580c1f5c 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp @@ -41,7 +41,7 @@ using namespace lldb; using namespace lldb_private; using namespace minidump; -LLDB_PLUGIN(ProcessMinidump) +LLDB_PLUGIN_DEFINE(ProcessMinidump) namespace { diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp index 629f209b91fd8..ecbd30c10ae01 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp @@ -19,7 +19,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ScriptInterpreterLua) +LLDB_PLUGIN_DEFINE(ScriptInterpreterLua) class IOHandlerLuaInterpreter : public IOHandlerDelegate, public IOHandlerEditline { diff --git a/lldb/source/Plugins/ScriptInterpreter/None/ScriptInterpreterNone.cpp b/lldb/source/Plugins/ScriptInterpreter/None/ScriptInterpreterNone.cpp index 23ff6b159633a..d9c32cc132d4c 100644 --- a/lldb/source/Plugins/ScriptInterpreter/None/ScriptInterpreterNone.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/None/ScriptInterpreterNone.cpp @@ -20,7 +20,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(ScriptInterpreterNone) +LLDB_PLUGIN_DEFINE(ScriptInterpreterNone) ScriptInterpreterNone::ScriptInterpreterNone(Debugger &debugger) : ScriptInterpreter(debugger, eScriptLanguageNone) {} diff --git 
a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index cc03627de901b..722af713ba437 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -56,7 +56,7 @@ using namespace lldb_private; using namespace lldb_private::python; using llvm::Expected; -LLDB_PLUGIN(ScriptInterpreterPython) +LLDB_PLUGIN_DEFINE(ScriptInterpreterPython) // Defined in the SWIG source file #if PY_MAJOR_VERSION >= 3 diff --git a/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp b/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp index 8f1a2b57bc611..e61d9630656dd 100644 --- a/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp +++ b/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp @@ -36,7 +36,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(StructuredDataDarwinLog) +LLDB_PLUGIN_DEFINE(StructuredDataDarwinLog) #pragma mark - #pragma mark Anonymous Namespace diff --git a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp index d97ad7cbe451e..fcefb2e059b2b 100644 --- a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp +++ b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp @@ -25,7 +25,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::breakpad; -LLDB_PLUGIN(SymbolFileBreakpad) +LLDB_PLUGIN_DEFINE(SymbolFileBreakpad) char SymbolFileBreakpad::ID; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index b45d84870ffbf..2ed050cc193f9 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ 
-94,7 +94,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(SymbolFileDWARF) +LLDB_PLUGIN_DEFINE(SymbolFileDWARF) char SymbolFileDWARF::ID; diff --git a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp index 51459a99576de..75f2eb1594214 100644 --- a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp @@ -58,7 +58,7 @@ using namespace lldb; using namespace lldb_private; using namespace llvm::pdb; -LLDB_PLUGIN(SymbolFilePDB) +LLDB_PLUGIN_DEFINE(SymbolFilePDB) char SymbolFilePDB::ID; diff --git a/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.cpp b/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.cpp index 42b843694a6f8..c4a0e609aa22e 100644 --- a/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.cpp +++ b/lldb/source/Plugins/SymbolFile/Symtab/SymbolFileSymtab.cpp @@ -25,7 +25,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(SymbolFileSymtab) +LLDB_PLUGIN_DEFINE(SymbolFileSymtab) char SymbolFileSymtab::ID; diff --git a/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp b/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp index 21242a0c8b950..2e6fd43650212 100644 --- a/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp +++ b/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp @@ -25,7 +25,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(SymbolVendorELF) +LLDB_PLUGIN_DEFINE(SymbolVendorELF) // SymbolVendorELF constructor SymbolVendorELF::SymbolVendorELF(const lldb::ModuleSP &module_sp) diff --git a/lldb/source/Plugins/SymbolVendor/MacOSX/SymbolVendorMacOSX.cpp b/lldb/source/Plugins/SymbolVendor/MacOSX/SymbolVendorMacOSX.cpp index 71a1025bef259..2b67fee706178 100644 --- a/lldb/source/Plugins/SymbolVendor/MacOSX/SymbolVendorMacOSX.cpp +++ b/lldb/source/Plugins/SymbolVendor/MacOSX/SymbolVendorMacOSX.cpp @@ -26,7 +26,7 @@ using namespace lldb; using 
namespace lldb_private; -LLDB_PLUGIN(SymbolVendorMacOSX) +LLDB_PLUGIN_DEFINE(SymbolVendorMacOSX) // SymbolVendorMacOSX constructor SymbolVendorMacOSX::SymbolVendorMacOSX(const lldb::ModuleSP &module_sp) diff --git a/lldb/source/Plugins/SymbolVendor/wasm/SymbolVendorWasm.cpp b/lldb/source/Plugins/SymbolVendor/wasm/SymbolVendorWasm.cpp index 64dd956fd35fd..1c09dabc5622f 100644 --- a/lldb/source/Plugins/SymbolVendor/wasm/SymbolVendorWasm.cpp +++ b/lldb/source/Plugins/SymbolVendor/wasm/SymbolVendorWasm.cpp @@ -26,7 +26,7 @@ using namespace lldb; using namespace lldb_private; using namespace lldb_private::wasm; -LLDB_PLUGIN(SymbolVendorWasm) +LLDB_PLUGIN_DEFINE(SymbolVendorWasm) // SymbolVendorWasm constructor SymbolVendorWasm::SymbolVendorWasm(const lldb::ModuleSP &module_sp) diff --git a/lldb/source/Plugins/SystemRuntime/MacOSX/SystemRuntimeMacOSX.cpp b/lldb/source/Plugins/SystemRuntime/MacOSX/SystemRuntimeMacOSX.cpp index 7ac70331267e0..25db5fe892fb6 100644 --- a/lldb/source/Plugins/SystemRuntime/MacOSX/SystemRuntimeMacOSX.cpp +++ b/lldb/source/Plugins/SystemRuntime/MacOSX/SystemRuntimeMacOSX.cpp @@ -34,7 +34,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(SystemRuntimeMacOSX) +LLDB_PLUGIN_DEFINE(SystemRuntimeMacOSX) // Create an instance of this class. 
This function is filled into the plugin // info class that gets handed out by the plugin factory and allows the lldb to diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index cbe0301fe162e..2fa5dc38eb8eb 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -81,7 +81,7 @@ using namespace lldb_private; using namespace clang; using llvm::StringSwitch; -LLDB_PLUGIN(TypeSystemClang) +LLDB_PLUGIN_DEFINE(TypeSystemClang) namespace { #ifdef LLDB_CONFIGURATION_DEBUG diff --git a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp index 01ef34204e5e9..ba7544fb52dd9 100644 --- a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp +++ b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp @@ -28,7 +28,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(UnwindAssemblyInstEmulation) +LLDB_PLUGIN_DEFINE(UnwindAssemblyInstEmulation) // UnwindAssemblyInstEmulation method definitions diff --git a/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp b/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp index c49ca465b0a9f..430ba09b811c4 100644 --- a/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp +++ b/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp @@ -30,7 +30,7 @@ using namespace lldb; using namespace lldb_private; -LLDB_PLUGIN(UnwindAssembly_x86) +LLDB_PLUGIN_DEFINE(UnwindAssembly_x86) // UnwindAssemblyParser_x86 method definitions diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index cd48f93f80ca2..1c75c8ea35be8 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -793,6 +793,10 @@ 
if(NOT CYGWIN AND NOT WIN32) endif() add_flag_if_supported("-fdata-sections" FDATA_SECTIONS) endif() +elseif(MSVC) + if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" ) + append("/Gw" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + endif() endif() if(MSVC) diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst index 5bf9f37384ab2..48d561349108c 100644 --- a/llvm/docs/GettingStarted.rst +++ b/llvm/docs/GettingStarted.rst @@ -46,7 +46,7 @@ This is an example workflow and configuration to get and build the LLVM source: * ``cd build`` * ``cmake -G [options] ../llvm`` - Some common generators are: + Some common build system generators are: * ``Ninja`` --- for generating `Ninja `_ build files. Most llvm developers use Ninja. @@ -75,9 +75,11 @@ This is an example workflow and configuration to get and build the LLVM source: * ``-DLLVM_ENABLE_ASSERTIONS=On`` --- Compile with assertion checks enabled (default is Yes for Debug builds, No for all other build types). - * Run your build tool of choice! + * ``cmake --build . [--target ]`` or the build system specified + above directly. - * The default target (i.e. ``ninja`` or ``make``) will build all of LLVM. + * The default target (i.e. ``cmake --build .`` or ``make``) will build all of + LLVM. * The ``check-all`` target (i.e. ``ninja check-all``) will run the regression tests to ensure everything is in working order. @@ -85,10 +87,10 @@ This is an example workflow and configuration to get and build the LLVM source: * CMake will generate build targets for each tool and library, and most LLVM sub-projects generate their own ``check-`` target. - * Running a serial build will be *slow*. To improve speed, try running a - parallel build. That's done by default in Ninja; for ``make``, use - ``make -j NNN`` (NNN is the number of parallel jobs, use e.g. number of - CPUs you have.) + * Running a serial build will be **slow**. To improve speed, try running a + parallel build. 
That's done by default in Ninja; for ``make``, use the + option ``-j NN``, where ``NN`` is the number of parallel jobs, e.g. the + number of available CPUs. * For more information see `CMake `__ diff --git a/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp b/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp index 3828e0a5f82b3..0c986d5a3f1c5 100644 --- a/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp +++ b/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp @@ -131,7 +131,7 @@ class SpeculativeJIT { std::unique_ptr ES; DataLayout DL; MangleAndInterner Mangle{*ES, DL}; - ThreadPool CompileThreads{NumThreads}; + ThreadPool CompileThreads{llvm::hardware_concurrency(NumThreads)}; JITDylib &MainJD; diff --git a/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp b/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp index 394c1308fd6fa..b920bee6ad14b 100644 --- a/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp +++ b/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp @@ -262,7 +262,8 @@ void ThinLtoJIT::setupLayers(JITTargetMachineBuilder JTMB, OnDemandLayer->setPartitionFunction(CompileOnDemandLayer::compileWholeModule); // Delegate compilation to the thread pool. 
- CompileThreads = std::make_unique(NumCompileThreads); + CompileThreads = std::make_unique( + llvm::hardware_concurrency(NumCompileThreads)); ES.setDispatchMaterialization( [this](JITDylib &JD, std::unique_ptr MU) { if (IsTrivialModule(MU.get())) { diff --git a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h index a6574be5c3973..29a24a0c5e147 100644 --- a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h +++ b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h @@ -26,7 +26,7 @@ class ThinLtoModuleIndex { public: ThinLtoModuleIndex(ExecutionSession &ES, unsigned ParseModuleThreads) : ES(ES), CombinedSummaryIndex(HaveGVs), - ParseModuleWorkers(ParseModuleThreads), + ParseModuleWorkers(llvm::hardware_concurrency(ParseModuleThreads)), NumParseModuleThreads(ParseModuleThreads) {} Error add(StringRef InputPath); diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h index 5284be8c4a027..02e01effc0fc9 100644 --- a/llvm/include/llvm/ADT/BitVector.h +++ b/llvm/include/llvm/ADT/BitVector.h @@ -14,6 +14,7 @@ #define LLVM_ADT_BITVECTOR_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/MathExtras.h" #include @@ -719,6 +720,14 @@ class BitVector { if (this == &RHS) return *this; Size = RHS.size(); + + // Handle tombstone when the BitVector is a key of a DenseHash. + if (RHS.isInvalid()) { + std::free(Bits.data()); + Bits = None; + return *this; + } + unsigned RHSWords = NumBitWords(Size); if (Size <= getBitCapacity()) { if (Size) @@ -758,6 +767,14 @@ class BitVector { std::swap(Size, RHS.Size); } + void invalid() { + assert(!Size && Bits.empty()); + Size = (unsigned)-1; + } + bool isInvalid() const { return Size == (unsigned)-1; } + + ArrayRef getData() const { return Bits; } + //===--------------------------------------------------------------------===// // Portable bit mask operations. 
//===--------------------------------------------------------------------===// @@ -932,6 +949,23 @@ inline size_t capacity_in_bytes(const BitVector &X) { return X.getMemorySize(); } +template <> struct DenseMapInfo { + static inline BitVector getEmptyKey() { return BitVector(); } + static inline BitVector getTombstoneKey() { + BitVector V; + V.invalid(); + return V; + } + static unsigned getHashValue(const BitVector &V) { + return DenseMapInfo>>::getHashValue( + std::make_pair(V.size(), V.getData())); + } + static bool isEqual(const BitVector &LHS, const BitVector &RHS) { + if (LHS.isInvalid() || RHS.isInvalid()) + return LHS.isInvalid() == RHS.isInvalid(); + return LHS == RHS; + } +}; } // end namespace llvm namespace std { diff --git a/llvm/include/llvm/ADT/SmallBitVector.h b/llvm/include/llvm/ADT/SmallBitVector.h index 61375c0080220..b7367d68bdae8 100644 --- a/llvm/include/llvm/ADT/SmallBitVector.h +++ b/llvm/include/llvm/ADT/SmallBitVector.h @@ -662,6 +662,16 @@ class SmallBitVector { getPointer()->clearBitsNotInMask(Mask, MaskWords); } + void invalid() { + assert(empty()); + X = (uintptr_t)-1; + } + bool isInvalid() const { return X == (uintptr_t)-1; } + + ArrayRef getData() const { + return isSmall() ? 
makeArrayRef(X) : getPointer()->getData(); + } + private: template void applyMask(const uint32_t *Mask, unsigned MaskWords) { @@ -699,6 +709,23 @@ operator^(const SmallBitVector &LHS, const SmallBitVector &RHS) { return Result; } +template <> struct DenseMapInfo { + static inline SmallBitVector getEmptyKey() { return SmallBitVector(); } + static inline SmallBitVector getTombstoneKey() { + SmallBitVector V; + V.invalid(); + return V; + } + static unsigned getHashValue(const SmallBitVector &V) { + return DenseMapInfo>>::getHashValue( + std::make_pair(V.size(), V.getData())); + } + static bool isEqual(const SmallBitVector &LHS, const SmallBitVector &RHS) { + if (LHS.isInvalid() || RHS.isInvalid()) + return LHS.isInvalid() == RHS.isInvalid(); + return LHS == RHS; + } +}; } // end namespace llvm namespace std { diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 0bdb49edc9830..d3f8896eca162 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1471,6 +1471,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { SingleCallCost = TargetTransformInfo::TCC_Expensive; break; // FIXME: ctlz, cttz, ... + case Intrinsic::bswap: + ISDs.push_back(ISD::BSWAP); + break; + case Intrinsic::bitreverse: + ISDs.push_back(ISD::BITREVERSE); + break; } const TargetLoweringBase *TLI = getTLI(); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h index b4f9b96653c59..af8129b98a2b8 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h @@ -280,9 +280,36 @@ class LegalizationArtifactCombiner { } } + /// Try to replace DstReg with SrcReg or build a COPY instruction + /// depending on the register constraints. 
+ static void replaceRegOrBuildCopy(Register DstReg, Register SrcReg, + MachineRegisterInfo &MRI, + MachineIRBuilder &Builder, + SmallVectorImpl &UpdatedDefs, + GISelObserverWrapper &Observer) { + if (!llvm::canReplaceReg(DstReg, SrcReg, MRI)) { + Builder.buildCopy(DstReg, SrcReg); + UpdatedDefs.push_back(DstReg); + return; + } + SmallVector UseMIs; + // Get the users and notify the observer before replacing. + for (auto &UseMI : MRI.use_instructions(DstReg)) { + UseMIs.push_back(&UseMI); + Observer.changingInstr(UseMI); + } + // Replace the registers. + MRI.replaceRegWith(DstReg, SrcReg); + UpdatedDefs.push_back(SrcReg); + // Notify the observer that we changed the instructions. + for (auto *UseMI : UseMIs) + Observer.changedInstr(*UseMI); + } + bool tryCombineMerges(MachineInstr &MI, SmallVectorImpl &DeadInsts, - SmallVectorImpl &UpdatedDefs) { + SmallVectorImpl &UpdatedDefs, + GISelObserverWrapper &Observer) { assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES); unsigned NumDefs = MI.getNumOperands() - 1; @@ -395,10 +422,12 @@ class LegalizationArtifactCombiner { "Bitcast and the other kinds of conversions should " "have happened earlier"); + Builder.setInstr(MI); for (unsigned Idx = 0; Idx < NumDefs; ++Idx) { - Register NewDef = MergeI->getOperand(Idx + 1).getReg(); - MRI.replaceRegWith(MI.getOperand(Idx).getReg(), NewDef); - UpdatedDefs.push_back(NewDef); + Register DstReg = MI.getOperand(Idx).getReg(); + Register SrcReg = MergeI->getOperand(Idx + 1).getReg(); + replaceRegOrBuildCopy(DstReg, SrcReg, MRI, Builder, UpdatedDefs, + Observer); } } @@ -498,7 +527,7 @@ class LegalizationArtifactCombiner { Changed = tryCombineSExt(MI, DeadInsts, UpdatedDefs); break; case TargetOpcode::G_UNMERGE_VALUES: - Changed = tryCombineMerges(MI, DeadInsts, UpdatedDefs); + Changed = tryCombineMerges(MI, DeadInsts, UpdatedDefs, WrapperObserver); break; case TargetOpcode::G_EXTRACT: Changed = tryCombineExtract(MI, DeadInsts, UpdatedDefs); diff --git 
a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 0ef9a713f784e..6f35718902518 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -266,6 +266,10 @@ class LegalizerHelper { LegalizeResult lowerSITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult lowerFPTOUI(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult lowerFPTOSI(MachineInstr &MI); + + LegalizeResult lowerFPTRUNC_F64_TO_F16(MachineInstr &MI); + LegalizeResult lowerFPTRUNC(MachineInstr &MI, unsigned TypeIdx, LLT Ty); + LegalizeResult lowerMinMax(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult lowerFCopySign(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index 63c5746bf183f..a88a97c666ad5 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -93,6 +93,11 @@ bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI); + +/// Check if DstReg can be replaced with SrcReg depending on the register +/// constraints. +bool canReplaceReg(Register DstReg, Register SrcReg, MachineRegisterInfo &MRI); + /// Check whether an instruction \p MI is dead: it only defines dead virtual /// registers, and doesn't have other side effects. 
bool isTriviallyDead(const MachineInstr &MI, const MachineRegisterInfo &MRI); diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPContext.h b/llvm/include/llvm/Frontend/OpenMP/OMPContext.h index 3b401b72a7d8a..960c557f55d40 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPContext.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPContext.h @@ -49,6 +49,9 @@ enum class TraitProperty { /// Parse \p Str and return the trait set it matches or TraitSet::invalid. TraitSet getOpenMPContextTraitSetKind(StringRef Str); +/// Return the trait set for which \p Selector is a selector. +TraitSet getOpenMPContextTraitSetForSelector(TraitSelector Selector); + /// Return the trait set for which \p Property is a property. TraitSet getOpenMPContextTraitSetForProperty(TraitProperty Property); @@ -67,9 +70,7 @@ StringRef getOpenMPContextTraitSelectorName(TraitSelector Kind); /// Parse \p Str and return the trait set it matches or /// TraitProperty::invalid. -TraitProperty getOpenMPContextTraitPropertyKind(TraitSet Set, - TraitSelector Selector, - StringRef Str); +TraitProperty getOpenMPContextTraitPropertyKind(TraitSet Set, StringRef Str); /// Return the trait property for a singleton selector \p Selector. TraitProperty getOpenMPContextTraitPropertyForSelector(TraitSelector Selector); @@ -80,6 +81,16 @@ StringRef getOpenMPContextTraitPropertyName(TraitProperty Kind); /// Return a textual representation of the trait property \p Kind with selector /// and set name included. StringRef getOpenMPContextTraitPropertyFullName(TraitProperty Kind); + +/// Return a string listing all trait sets. +std::string listOpenMPContextTraitSets(); + +/// Return a string listing all trait selectors for \p Set. +std::string listOpenMPContextTraitSelectors(TraitSet Set); + +/// Return a string listing all trait properties for \p Set and \p Selector. +std::string listOpenMPContextTraitProperties(TraitSet Set, + TraitSelector Selector); ///} /// Return true if \p Selector can be nested in \p Set. 
Also sets diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index e864e05df3407..df28533456827 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -227,7 +227,8 @@ using ThinBackend = std::function( AddStreamFn AddStream, NativeObjectCache Cache)>; /// This ThinBackend runs the individual backend jobs in-process. -ThinBackend createInProcessThinBackend(unsigned ParallelismLevel); +/// The default value means to use one job per hardware core (not hyper-thread). +ThinBackend createInProcessThinBackend(unsigned ParallelismLevel = 0); /// This ThinBackend writes individual module indexes to files, instead of /// running the individual backend jobs. This backend is for distributed builds diff --git a/llvm/include/llvm/Support/ThreadPool.h b/llvm/include/llvm/Support/ThreadPool.h index 4bcbaa3142fd4..2036f46c6d561 100644 --- a/llvm/include/llvm/Support/ThreadPool.h +++ b/llvm/include/llvm/Support/ThreadPool.h @@ -13,7 +13,9 @@ #ifndef LLVM_SUPPORT_THREAD_POOL_H #define LLVM_SUPPORT_THREAD_POOL_H +#include "llvm/ADT/BitVector.h" #include "llvm/Config/llvm-config.h" +#include "llvm/Support/Threading.h" #include "llvm/Support/thread.h" #include @@ -38,12 +40,11 @@ class ThreadPool { using TaskTy = std::function; using PackagedTaskTy = std::packaged_task; - /// Construct a pool with the number of threads found by - /// hardware_concurrency(). - ThreadPool(); - - /// Construct a pool of \p ThreadCount threads - ThreadPool(unsigned ThreadCount); + /// Construct a pool using the hardware strategy \p S for mapping hardware + /// execution resources (threads, cores, CPUs) + /// Defaults to using the maximum execution resources in the system, but + /// excluding any resources contained in the affinity mask. + ThreadPool(ThreadPoolStrategy S = hardware_concurrency()); /// Blocking destructor: the pool will wait for all the threads to complete. 
~ThreadPool(); @@ -68,6 +69,8 @@ class ThreadPool { /// It is an error to try to add new tasks while blocking on this call. void wait(); + unsigned getThreadCount() const { return ThreadCount; } + private: /// Asynchronous submission of a task to the pool. The returned future can be /// used to wait for the task to finish and is *non-blocking* on destruction. @@ -94,6 +97,8 @@ class ThreadPool { /// Signal for the destruction of the pool, asking thread to exit. bool EnableFlag; #endif + + unsigned ThreadCount; }; } diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h index bacab8fa23b6d..d3d4a37e69c66 100644 --- a/llvm/include/llvm/Support/Threading.h +++ b/llvm/include/llvm/Support/Threading.h @@ -14,6 +14,7 @@ #ifndef LLVM_SUPPORT_THREADING_H #define LLVM_SUPPORT_THREADING_H +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/FunctionExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX @@ -143,20 +144,52 @@ void llvm_execute_on_thread_async( #endif } - /// Get the amount of currency to use for tasks requiring significant - /// memory or other resources. Currently based on physical cores, if - /// available for the host system, otherwise falls back to - /// thread::hardware_concurrency(). - /// Returns 1 when LLVM is configured with LLVM_ENABLE_THREADS=OFF - unsigned heavyweight_hardware_concurrency(); - - /// Get the number of threads that the current program can execute - /// concurrently. On some systems std::thread::hardware_concurrency() returns - /// the total number of cores, without taking affinity into consideration. - /// Returns 1 when LLVM is configured with LLVM_ENABLE_THREADS=OFF. - /// Fallback to std::thread::hardware_concurrency() if sched_getaffinity is - /// not available. 
- unsigned hardware_concurrency(); + /// This tells how a thread pool will be used + class ThreadPoolStrategy { + public: + // The default value (0) means all available threads should be used, + // excluding affinity mask. If set, this value only represents a suggested + // high bound, the runtime might choose a lower value (not higher). + unsigned ThreadsRequested = 0; + + // If SMT is active, use hyper threads. If false, there will be only one + // std::thread per core. + bool UseHyperThreads = true; + + /// Retrieves the max available threads for the current strategy. This + /// accounts for affinity masks and takes advantage of all CPU sockets. + unsigned compute_thread_count() const; + + /// Assign the current thread to an ideal hardware CPU or NUMA node. In a + /// multi-socket system, this ensures threads are assigned to all CPU + /// sockets. \p ThreadPoolNum represents a number bounded by [0, + /// compute_thread_count()). + void apply_thread_strategy(unsigned ThreadPoolNum) const; + }; + + /// Returns a thread strategy for tasks requiring significant memory or other + /// resources. To be used for workloads where hardware_concurrency() proves to + /// be less efficient. Avoid this strategy if doing lots of I/O. Currently + /// based on physical cores, if available for the host system, otherwise falls + /// back to hardware_concurrency(). Returns 1 when LLVM is configured with + /// LLVM_ENABLE_THREADS = OFF + inline ThreadPoolStrategy + heavyweight_hardware_concurrency(unsigned ThreadCount = 0) { + ThreadPoolStrategy S; + S.UseHyperThreads = false; + S.ThreadsRequested = ThreadCount; + return S; + } + + /// Returns a default thread strategy where all available hardware resources + /// are to be used, except for those initially excluded by an affinity mask. + /// This function takes affinity into consideration. Returns 1 when LLVM is + /// configured with LLVM_ENABLE_THREADS=OFF.
+ inline ThreadPoolStrategy hardware_concurrency(unsigned ThreadCount = 0) { + ThreadPoolStrategy S; + S.ThreadsRequested = ThreadCount; + return S; + } /// Return the current thread id, as used in various OS system calls. /// Note that not all platforms guarantee that the value returned will be @@ -184,6 +217,14 @@ void llvm_execute_on_thread_async( /// the operation succeeded or failed is returned. void get_thread_name(SmallVectorImpl &Name); + /// Returns a mask that represents on which hardware thread, core, CPU, NUMA + /// group, the calling thread can be executed. On Windows, threads cannot + /// cross CPU boundaries. + llvm::BitVector get_thread_affinity_mask(); + + /// Returns how many physical CPUs or NUMA groups the system has. + unsigned get_cpus(); + enum class ThreadPriority { Background = 0, Default = 1, diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index ce9944a5ce4be..450595cac57b4 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -902,11 +902,6 @@ MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock( Instruction *QueryInst, const MemoryLocation &Loc, bool isLoad, BasicBlock *BB, NonLocalDepInfo *Cache, unsigned NumSortedEntries) { - bool isInvariantLoad = false; - - if (LoadInst *LI = dyn_cast_or_null(QueryInst)) - isInvariantLoad = LI->getMetadata(LLVMContext::MD_invariant_load); - // Do a binary search to see if we already have an entry for this block in // the cache set. If so, find it. NonLocalDepInfo::iterator Entry = std::upper_bound( @@ -918,13 +913,6 @@ MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock( if (Entry != Cache->begin() + NumSortedEntries && Entry->getBB() == BB) ExistingResult = &*Entry; - // Use cached result for invariant load only if there is no dependency for non - // invariant load. In this case invariant load can not have any dependency as - // well. 
- if (ExistingResult && isInvariantLoad && - !ExistingResult->getResult().isNonFuncLocal()) - ExistingResult = nullptr; - // If we have a cached entry, and it is non-dirty, use it as the value for // this dependency. if (ExistingResult && !ExistingResult->getResult().isDirty()) { @@ -953,10 +941,6 @@ MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock( MemDepResult Dep = getPointerDependencyFrom(Loc, isLoad, ScanPos, BB, QueryInst); - // Don't cache results for invariant load. - if (isInvariantLoad) - return Dep; - // If we had a dirty entry for the block, update it. Otherwise, just add // a new entry. if (ExistingResult) @@ -1045,10 +1029,6 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( InitialNLPI.Size = Loc.Size; InitialNLPI.AATags = Loc.AATags; - bool isInvariantLoad = false; - if (LoadInst *LI = dyn_cast_or_null(QueryInst)) - isInvariantLoad = LI->getMetadata(LLVMContext::MD_invariant_load); - // Get the NLPI for CacheKey, inserting one into the map if it doesn't // already have one. std::pair Pair = @@ -1057,8 +1037,7 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( // If we already have a cache entry for this CacheKey, we may need to do some // work to reconcile the cache entry and the current query. - // Invariant loads don't participate in caching. Thus no need to reconcile. - if (!isInvariantLoad && !Pair.second) { + if (!Pair.second) { if (CacheInfo->Size != Loc.Size) { bool ThrowOutEverything; if (CacheInfo->Size.hasValue() && Loc.Size.hasValue()) { @@ -1114,10 +1093,7 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( // If we have valid cached information for exactly the block we are // investigating, just return it with no recomputation. - // Don't use cached information for invariant loads since it is valid for - // non-invariant loads only. 
- if (!isInvariantLoad && - CacheInfo->Pair == BBSkipFirstBlockPair(StartBB, SkipFirstBlock)) { + if (CacheInfo->Pair == BBSkipFirstBlockPair(StartBB, SkipFirstBlock)) { // We have a fully cached result for this query then we can just return the // cached results and populate the visited set. However, we have to verify // that we don't already have conflicting results for these blocks. Check @@ -1153,18 +1129,14 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( return true; } - // Invariant loads don't affect cache in any way thus no need to update - // CacheInfo as well. - if (!isInvariantLoad) { - // Otherwise, either this is a new block, a block with an invalid cache - // pointer or one that we're about to invalidate by putting more info into - // it than its valid cache info. If empty, the result will be valid cache - // info, otherwise it isn't. - if (Cache->empty()) - CacheInfo->Pair = BBSkipFirstBlockPair(StartBB, SkipFirstBlock); - else - CacheInfo->Pair = BBSkipFirstBlockPair(); - } + // Otherwise, either this is a new block, a block with an invalid cache + // pointer or one that we're about to invalidate by putting more info into it + // than its valid cache info. If empty, the result will be valid cache info, + // otherwise it isn't. + if (Cache->empty()) + CacheInfo->Pair = BBSkipFirstBlockPair(StartBB, SkipFirstBlock); + else + CacheInfo->Pair = BBSkipFirstBlockPair(); SmallVector Worklist; Worklist.push_back(StartBB); @@ -1405,26 +1377,22 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( if (SkipFirstBlock) return false; - // Results of invariant loads are not cached thus no need to update cached - // information. 
- if (!isInvariantLoad) { - bool foundBlock = false; - for (NonLocalDepEntry &I : llvm::reverse(*Cache)) { - if (I.getBB() != BB) - continue; + bool foundBlock = false; + for (NonLocalDepEntry &I : llvm::reverse(*Cache)) { + if (I.getBB() != BB) + continue; - assert((GotWorklistLimit || I.getResult().isNonLocal() || - !DT.isReachableFromEntry(BB)) && - "Should only be here with transparent block"); - foundBlock = true; - I.setResult(MemDepResult::getUnknown()); - Result.push_back( - NonLocalDepResult(I.getBB(), I.getResult(), Pointer.getAddr())); - break; - } - (void)foundBlock; (void)GotWorklistLimit; - assert((foundBlock || GotWorklistLimit) && "Current block not in cache?"); + assert((GotWorklistLimit || I.getResult().isNonLocal() || + !DT.isReachableFromEntry(BB)) && + "Should only be here with transparent block"); + foundBlock = true; + I.setResult(MemDepResult::getUnknown()); + Result.push_back( + NonLocalDepResult(I.getBB(), I.getResult(), Pointer.getAddr())); + break; } + (void)foundBlock; (void)GotWorklistLimit; + assert((foundBlock || GotWorklistLimit) && "Current block not in cache?"); } // Okay, we're done now. If we added new values to the cache, re-sort it. diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 38fbac264430f..02aa2b36783d5 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1705,13 +1705,16 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { const Function &F = MF.getFunction(); // Get the function symbol. - if (TM.getTargetTriple().isOSAIX()) { + if (!MAI->needsFunctionDescriptors()) { + CurrentFnSym = getSymbol(&MF.getFunction()); + } else { + assert(TM.getTargetTriple().isOSAIX() && + "Only AIX uses the function descriptor hooks."); // AIX is unique here in that the name of the symbol emitted for the // function body does not have the same name as the source function's // C-linkage name. 
- assert(MAI->needsFunctionDescriptors() && "AIX ABI is descriptor based."); assert(CurrentFnDescSym && "The function descriptor symbol needs to be" - " initalized first."); + " initalized first."); // Get the function entry point symbol. CurrentFnSym = @@ -1721,8 +1724,6 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { MCSectionXCOFF *FnEntryPointSec = cast(getObjFileLowering().SectionForGlobal(&F, TM)); cast(CurrentFnSym)->setContainingCsect(FnEntryPointSec); - } else { - CurrentFnSym = getSymbol(&MF.getFunction()); } CurrentFnSymForSize = CurrentFnSym; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index cc8f5a10ca07f..79fbe1db9d3fd 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -75,36 +75,7 @@ bool CombinerHelper::matchCombineCopy(MachineInstr &MI) { return false; Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); - - // Give up if either DstReg or SrcReg is a physical register. - if (Register::isPhysicalRegister(DstReg) || - Register::isPhysicalRegister(SrcReg)) - return false; - - // Give up the types don't match. - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(SrcReg); - // Give up if one has a valid LLT, but the other doesn't. - if (DstTy.isValid() != SrcTy.isValid()) - return false; - // Give up if the types don't match. - if (DstTy.isValid() && SrcTy.isValid() && DstTy != SrcTy) - return false; - - // Get the register banks and classes. - const RegisterBank *DstBank = MRI.getRegBankOrNull(DstReg); - const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg); - const TargetRegisterClass *DstRC = MRI.getRegClassOrNull(DstReg); - const TargetRegisterClass *SrcRC = MRI.getRegClassOrNull(SrcReg); - - // Replace if the register constraints match. - if ((SrcRC == DstRC) && (SrcBank == DstBank)) - return true; - // Replace if DstReg has no constraints. 
- if (!DstBank && !DstRC) - return true; - - return false; + return canReplaceReg(DstReg, SrcReg, MRI); } void CombinerHelper::applyCombineCopy(MachineInstr &MI) { Register DstReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 5396fcfc4824f..3af0705dff854 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -2487,6 +2487,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { return lowerFPTOUI(MI, TypeIdx, Ty); case G_FPTOSI: return lowerFPTOSI(MI); + case G_FPTRUNC: + return lowerFPTRUNC(MI, TypeIdx, Ty); case G_SMIN: case G_SMAX: case G_UMIN: @@ -4476,6 +4478,128 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) { return Legalized; } +// f64 -> f16 conversion using round-to-nearest-even rounding mode. +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + + if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly. + return UnableToLegalize; + + const unsigned ExpMask = 0x7ff; + const unsigned ExpBiasf64 = 1023; + const unsigned ExpBiasf16 = 15; + const LLT S32 = LLT::scalar(32); + const LLT S1 = LLT::scalar(1); + + auto Unmerge = MIRBuilder.buildUnmerge(S32, Src); + Register U = Unmerge.getReg(0); + Register UH = Unmerge.getReg(1); + + auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20)); + + // Subtract the fp64 exponent bias (1023) to get the real exponent and + // add the f16 bias (15) to get the biased exponent for the f16 format. 
+ E = MIRBuilder.buildAdd( + S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16)); + E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask)); + + auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8)); + M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe)); + + auto MaskedSig = MIRBuilder.buildAnd(S32, UH, + MIRBuilder.buildConstant(S32, 0x1ff)); + MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U); + + auto Zero = MIRBuilder.buildConstant(S32, 0); + auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero); + auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0); + M = MIRBuilder.buildOr(S32, M, Lo40Set); + + // (M != 0 ? 0x0200 : 0) | 0x7c00; + auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200); + auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero); + auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero); + + auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00); + auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00); + + // N = M | (E << 12); + auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12)); + auto N = MIRBuilder.buildOr(S32, M, EShl12); + + // B = clamp(1-E, 0, 13); + auto One = MIRBuilder.buildConstant(S32, 1); + auto OneSubExp = MIRBuilder.buildSub(S32, One, E); + auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero); + B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13)); + + auto SigSetHigh = MIRBuilder.buildOr(S32, M, + MIRBuilder.buildConstant(S32, 0x1000)); + + auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B); + auto D0 = MIRBuilder.buildShl(S32, D, B); + + auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, + D0, SigSetHigh); + auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh); + D = MIRBuilder.buildOr(S32, D, D1); + + auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One); + auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N); + + auto VLow3 = 
MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7)); + V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2)); + + auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3, + MIRBuilder.buildConstant(S32, 3)); + auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3); + + auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3, + MIRBuilder.buildConstant(S32, 5)); + auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5); + + V1 = MIRBuilder.buildOr(S32, V0, V1); + V = MIRBuilder.buildAdd(S32, V, V1); + + auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, + E, MIRBuilder.buildConstant(S32, 30)); + V = MIRBuilder.buildSelect(S32, CmpEGt30, + MIRBuilder.buildConstant(S32, 0x7c00), V); + + auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, + E, MIRBuilder.buildConstant(S32, 1039)); + V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V); + + // Extract the sign bit. + auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16)); + Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000)); + + // Insert the sign bit + V = MIRBuilder.buildOr(S32, Sign, V); + + MIRBuilder.buildTrunc(Dst, V); + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerFPTRUNC(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Src); + const LLT S64 = LLT::scalar(64); + const LLT S16 = LLT::scalar(16); + + if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64) + return lowerFPTRUNC_F64_TO_F16(MI); + + return UnableToLegalize; +} + static CmpInst::Predicate minMaxToCompare(unsigned Opc) { switch (Opc) { case TargetOpcode::G_SMIN: diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 5f72974b31ec3..d29e9546be0bf 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ 
b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -155,6 +155,20 @@ bool llvm::constrainSelectedInstRegOperands(MachineInstr &I, return true; } +bool llvm::canReplaceReg(Register DstReg, Register SrcReg, + MachineRegisterInfo &MRI) { + // Give up if either DstReg or SrcReg is a physical register. + if (DstReg.isPhysical() || SrcReg.isPhysical()) + return false; + // Give up if the types don't match. + if (MRI.getType(DstReg) != MRI.getType(SrcReg)) + return false; + // Replace if either DstReg has no constraints or the register + // constraints match. + return !MRI.getRegClassOrRegBank(DstReg) || + MRI.getRegClassOrRegBank(DstReg) == MRI.getRegClassOrRegBank(SrcReg); +} + bool llvm::isTriviallyDead(const MachineInstr &MI, const MachineRegisterInfo &MRI) { // If we can move an instruction, we can remove it. Otherwise, it has diff --git a/llvm/lib/CodeGen/ParallelCG.cpp b/llvm/lib/CodeGen/ParallelCG.cpp index 7dbd830666fb8..c19ed1f8f71da 100644 --- a/llvm/lib/CodeGen/ParallelCG.cpp +++ b/llvm/lib/CodeGen/ParallelCG.cpp @@ -51,7 +51,7 @@ std::unique_ptr llvm::splitCodeGen( // Create ThreadPool in nested scope so that threads will be joined // on destruction. 
{ - ThreadPool CodegenThreadPool(OSs.size()); + ThreadPool CodegenThreadPool(hardware_concurrency(OSs.size())); int ThreadCount = 0; SplitModule( diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp index 03919c805130c..715ad24b55214 100644 --- a/llvm/lib/DWARFLinker/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp @@ -2446,7 +2446,7 @@ bool DWARFLinker::link() { } EmitLambda(); } else { - ThreadPool Pool(2); + ThreadPool Pool(hardware_concurrency(2)); Pool.async(AnalyzeAll); Pool.async(CloneAll); Pool.wait(); diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp index e01b6b6ebc0cc..c3bf71f21cda2 100644 --- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp +++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp @@ -445,7 +445,7 @@ Error DwarfTransformer::convert(uint32_t NumThreads) { // Now parse all DIEs in case we have cross compile unit references in a // thread pool. - ThreadPool pool(NumThreads); + ThreadPool pool(hardware_concurrency(NumThreads)); for (const auto &CU : DICtx.compile_units()) pool.async([&CU]() { CU->getUnitDIE(false /*CUDieOnly*/); }); pool.wait(); diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index f81e584b3b2dc..4218ca4e481f7 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -157,7 +157,8 @@ LLJIT::LLJIT(LLJITBuilderState &S, Error &Err) if (S.NumCompileThreads > 0) { TransformLayer->setCloneToNewContextOnEmit(true); - CompileThreads = std::make_unique(S.NumCompileThreads); + CompileThreads = + std::make_unique(hardware_concurrency(S.NumCompileThreads)); ES->setDispatchMaterialization( [this](JITDylib &JD, std::unique_ptr MU) { // FIXME: Switch to move capture once we have c++14. 
diff --git a/llvm/lib/Frontend/OpenMP/OMPContext.cpp b/llvm/lib/Frontend/OpenMP/OMPContext.cpp index 7bdc16af9014c..f4c4bdfad0b64 100644 --- a/llvm/lib/Frontend/OpenMP/OMPContext.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPContext.cpp @@ -286,6 +286,16 @@ TraitSet llvm::omp::getOpenMPContextTraitSetKind(StringRef S) { #include "llvm/Frontend/OpenMP/OMPKinds.def" .Default(TraitSet::invalid); } + +TraitSet +llvm::omp::getOpenMPContextTraitSetForSelector(TraitSelector Selector) { + switch (Selector) { +#define OMP_TRAIT_SELECTOR(Enum, TraitSetEnum, Str, ReqProp) \ + case TraitSelector::Enum: \ + return TraitSet::TraitSetEnum; +#include "llvm/Frontend/OpenMP/OMPKinds.def" + } +} TraitSet llvm::omp::getOpenMPContextTraitSetForProperty(TraitProperty Property) { switch (Property) { @@ -333,11 +343,10 @@ StringRef llvm::omp::getOpenMPContextTraitSelectorName(TraitSelector Kind) { llvm_unreachable("Unknown trait selector!"); } -TraitProperty llvm::omp::getOpenMPContextTraitPropertyKind( - TraitSet Set, TraitSelector Selector, StringRef S) { +TraitProperty llvm::omp::getOpenMPContextTraitPropertyKind(TraitSet Set, + StringRef S) { #define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \ - if (Set == TraitSet::TraitSetEnum && \ - Selector == TraitSelector::TraitSelectorEnum && Str == S) \ + if (Set == TraitSet::TraitSetEnum && Str == S) \ return TraitProperty::Enum; #include "llvm/Frontend/OpenMP/OMPKinds.def" return TraitProperty::invalid; @@ -398,3 +407,36 @@ bool llvm::omp::isValidTraitPropertyForTraitSetAndSelector( } llvm_unreachable("Unknown trait property!"); } + +std::string llvm::omp::listOpenMPContextTraitSets() { + std::string S; +#define OMP_TRAIT_SET(Enum, Str) \ + if (Str != "invalid") \ + S.append("'").append(Str).append("'").append(" "); +#include "llvm/Frontend/OpenMP/OMPKinds.def" + S.pop_back(); + return S; +} + +std::string llvm::omp::listOpenMPContextTraitSelectors(TraitSet Set) { + std::string S; +#define OMP_TRAIT_SELECTOR(Enum, TraitSetEnum, 
Str, ReqProp) \ + if (TraitSet::TraitSetEnum == Set && Str != "Invalid") \ + S.append("'").append(Str).append("'").append(" "); +#include "llvm/Frontend/OpenMP/OMPKinds.def" + S.pop_back(); + return S; +} + +std::string +llvm::omp::listOpenMPContextTraitProperties(TraitSet Set, + TraitSelector Selector) { + std::string S; +#define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \ + if (TraitSet::TraitSetEnum == Set && \ + TraitSelector::TraitSelectorEnum == Selector && Str != "invalid") \ + S.append("'").append(Str).append("'").append(" "); +#include "llvm/Frontend/OpenMP/OMPKinds.def" + S.pop_back(); + return S; +} diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp index 0a43e17c358c8..d92943d6975b8 100644 --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -31,10 +31,8 @@ static cl::opt PropagateAttrs("propagate-attrs", cl::init(true), cl::Hidden, cl::desc("Propagate attributes in index")); -// FIXME: Enable again when thin link compile time regressions understood and -// addressed static cl::opt ImportConstantsWithRefs( - "import-constants-with-refs", cl::init(false), cl::Hidden, + "import-constants-with-refs", cl::init(true), cl::Hidden, cl::desc("Import constant global variables with references")); FunctionSummary FunctionSummary::ExternalNode = diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 1d23c6bab36d5..f8affcb20ceff 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -477,8 +477,7 @@ LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel, LTO::ThinLTOState::ThinLTOState(ThinBackend Backend) : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) { if (!Backend) - this->Backend = - createInProcessThinBackend(llvm::heavyweight_hardware_concurrency()); + this->Backend = createInProcessThinBackend(); } LTO::LTO(Config Conf, ThinBackend Backend, @@ -1095,7 +1094,8 @@ class InProcessThinBackend : public ThinBackendProc { const 
StringMap &ModuleToDefinedGVSummaries, AddStreamFn AddStream, NativeObjectCache Cache) : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries), - BackendThreadPool(ThinLTOParallelismLevel), + BackendThreadPool( + heavyweight_hardware_concurrency(ThinLTOParallelismLevel)), AddStream(std::move(AddStream)), Cache(std::move(Cache)) { for (auto &Name : CombinedIndex.cfiFunctionDefs()) CfiFunctionDefs.insert( diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index b85471555b092..ec57744cf4803 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -375,7 +375,8 @@ void codegen(const Config &Conf, TargetMachine *TM, AddStreamFn AddStream, void splitCodeGen(const Config &C, TargetMachine *TM, AddStreamFn AddStream, unsigned ParallelCodeGenParallelismLevel, std::unique_ptr Mod) { - ThreadPool CodegenThreadPool(ParallelCodeGenParallelismLevel); + ThreadPool CodegenThreadPool( + heavyweight_hardware_concurrency(ParallelCodeGenParallelismLevel)); unsigned ThreadCount = 0; const Target *T = &TM->getTarget(); diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index a4f270240005c..152f0afcf12ea 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -80,8 +80,8 @@ extern cl::opt RemarksFormat; namespace { -static cl::opt - ThreadCount("threads", cl::init(llvm::heavyweight_hardware_concurrency())); +// Default to using one job per hardware core in the system +static cl::opt ThreadCount("threads", cl::init(0)); // Simple helper to save temporary files for debug. 
static void saveTempBitcode(const Module &TheModule, StringRef TempDir, @@ -1042,7 +1042,7 @@ void ThinLTOCodeGenerator::run() { // Parallel optimizer + codegen { - ThreadPool Pool(ThreadCount); + ThreadPool Pool(heavyweight_hardware_concurrency(ThreadCount)); for (auto IndexCount : ModulesOrdering) { auto &Mod = Modules[IndexCount]; Pool.async([&](int count) { diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp index ef38c1c09413a..7e772b2b1378a 100644 --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -1266,7 +1266,7 @@ StringRef sys::getHostCPUName() { return "generic"; } // On Linux, the number of physical cores can be computed from /proc/cpuinfo, // using the number of unique physical/core id pairs. The following // implementation reads the /proc/cpuinfo format on an x86_64 system. -static int computeHostNumPhysicalCores() { +int computeHostNumPhysicalCores() { // Read /proc/cpuinfo as a stream (until EOF reached). It cannot be // mmapped because it appears to have 0 size. llvm::ErrorOr> Text = @@ -1312,7 +1312,7 @@ static int computeHostNumPhysicalCores() { #include // Gets the number of *physical cores* on the machine. -static int computeHostNumPhysicalCores() { +int computeHostNumPhysicalCores() { uint32_t count; size_t len = sizeof(count); sysctlbyname("hw.physicalcpu", &count, &len, NULL, 0); @@ -1326,6 +1326,9 @@ static int computeHostNumPhysicalCores() { } return count; } +#elif defined(_WIN32) +// Defined in llvm/lib/Support/Windows/Threading.inc +int computeHostNumPhysicalCores(); #else // On other systems, return -1 to indicate unknown. static int computeHostNumPhysicalCores() { return -1; } diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp index 523665d14b029..0272a53beb393 100644 --- a/llvm/lib/Support/Parallel.cpp +++ b/llvm/lib/Support/Parallel.cpp @@ -39,20 +39,21 @@ class Executor { /// in filo order. 
class ThreadPoolExecutor : public Executor { public: - explicit ThreadPoolExecutor(unsigned ThreadCount = hardware_concurrency()) { + explicit ThreadPoolExecutor(ThreadPoolStrategy S = hardware_concurrency()) { + unsigned ThreadCount = S.compute_thread_count(); // Spawn all but one of the threads in another thread as spawning threads // can take a while. Threads.reserve(ThreadCount); Threads.resize(1); std::lock_guard Lock(Mutex); - Threads[0] = std::thread([&, ThreadCount] { - for (unsigned i = 1; i < ThreadCount; ++i) { - Threads.emplace_back([=] { work(); }); + Threads[0] = std::thread([this, ThreadCount, S] { + for (unsigned I = 1; I < ThreadCount; ++I) { + Threads.emplace_back([=] { work(S, I); }); if (Stop) break; } ThreadsCreated.set_value(); - work(); + work(S, 0); }); } @@ -90,7 +91,8 @@ class ThreadPoolExecutor : public Executor { } private: - void work() { + void work(ThreadPoolStrategy S, unsigned ThreadID) { + S.apply_thread_strategy(ThreadID); while (true) { std::unique_lock Lock(Mutex); Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); }); diff --git a/llvm/lib/Support/ThreadPool.cpp b/llvm/lib/Support/ThreadPool.cpp index 40982d777914d..5aa5815d7272c 100644 --- a/llvm/lib/Support/ThreadPool.cpp +++ b/llvm/lib/Support/ThreadPool.cpp @@ -20,16 +20,15 @@ using namespace llvm; #if LLVM_ENABLE_THREADS -// Default to hardware_concurrency -ThreadPool::ThreadPool() : ThreadPool(hardware_concurrency()) {} - -ThreadPool::ThreadPool(unsigned ThreadCount) - : ActiveThreads(0), EnableFlag(true) { +ThreadPool::ThreadPool(ThreadPoolStrategy S) + : ActiveThreads(0), EnableFlag(true), + ThreadCount(S.compute_thread_count()) { // Create ThreadCount threads that will loop forever, wait on QueueCondition // for tasks to be queued or the Pool to be destroyed. 
Threads.reserve(ThreadCount); for (unsigned ThreadID = 0; ThreadID < ThreadCount; ++ThreadID) { - Threads.emplace_back([&] { + Threads.emplace_back([S, ThreadID, this] { + S.apply_thread_strategy(ThreadID); while (true) { PackagedTaskTy Task; { @@ -108,12 +107,10 @@ ThreadPool::~ThreadPool() { #else // LLVM_ENABLE_THREADS Disabled -ThreadPool::ThreadPool() : ThreadPool(0) {} - // No threads are launched, issue a warning if ThreadCount is not 0 -ThreadPool::ThreadPool(unsigned ThreadCount) - : ActiveThreads(0) { - if (ThreadCount) { +ThreadPool::ThreadPool(ThreadPoolStrategy S) + : ActiveThreads(0), ThreadCount(S.compute_thread_count()) { + if (ThreadCount != 1) { errs() << "Warning: request a ThreadPool with " << ThreadCount << " threads, but LLVM_ENABLE_THREADS has been turned off\n"; } @@ -138,8 +135,6 @@ std::shared_future ThreadPool::asyncImpl(TaskTy Task) { return Future; } -ThreadPool::~ThreadPool() { - wait(); -} +ThreadPool::~ThreadPool() { wait(); } #endif diff --git a/llvm/lib/Support/Threading.cpp b/llvm/lib/Support/Threading.cpp index 48750cef5ec22..de5adaddd9d38 100644 --- a/llvm/lib/Support/Threading.cpp +++ b/llvm/lib/Support/Threading.cpp @@ -45,10 +45,6 @@ void llvm::llvm_execute_on_thread(void (*Fn)(void *), void *UserData, Fn(UserData); } -unsigned llvm::heavyweight_hardware_concurrency() { return 1; } - -unsigned llvm::hardware_concurrency() { return 1; } - uint64_t llvm::get_threadid() { return 0; } uint32_t llvm::get_max_thread_name_length() { return 0; } @@ -57,6 +53,13 @@ void llvm::set_thread_name(const Twine &Name) {} void llvm::get_thread_name(SmallVectorImpl &Name) { Name.clear(); } +llvm::BitVector llvm::get_thread_affinity_mask() { return {}; } + +unsigned llvm::ThreadPoolStrategy::compute_thread_count() const { + // When threads are disabled, ensure clients will loop at least once. 
+ return 1; +} + #if LLVM_ENABLE_THREADS == 0 void llvm::llvm_execute_on_thread_async( llvm::unique_function Func, @@ -78,30 +81,19 @@ void llvm::llvm_execute_on_thread_async( #else -#include -unsigned llvm::heavyweight_hardware_concurrency() { - // Since we can't get here unless LLVM_ENABLE_THREADS == 1, it is safe to use - // `std::thread` directly instead of `llvm::thread` (and indeed, doing so - // allows us to not define `thread` in the llvm namespace, which conflicts - // with some platforms such as FreeBSD whose headers also define a struct - // called `thread` in the global namespace which can cause ambiguity due to - // ADL. - int NumPhysical = sys::getHostNumPhysicalCores(); - if (NumPhysical == -1) - return std::thread::hardware_concurrency(); - return NumPhysical; -} +int computeHostNumHardwareThreads(); -unsigned llvm::hardware_concurrency() { -#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT) - cpu_set_t Set; - if (sched_getaffinity(0, sizeof(Set), &Set)) - return CPU_COUNT(&Set); -#endif - // Guard against std::thread::hardware_concurrency() returning 0. - if (unsigned Val = std::thread::hardware_concurrency()) - return Val; - return 1; +unsigned llvm::ThreadPoolStrategy::compute_thread_count() const { + int MaxThreadCount = UseHyperThreads ? computeHostNumHardwareThreads() + : sys::getHostNumPhysicalCores(); + if (MaxThreadCount <= 0) + MaxThreadCount = 1; + + // No need to create more threads than there are hardware threads, it would + // uselessly induce more context-switching and cache eviction. 
+ if (!ThreadsRequested || ThreadsRequested > (unsigned)MaxThreadCount) + return MaxThreadCount; + return ThreadsRequested; } namespace { diff --git a/llvm/lib/Support/Unix/Threading.inc b/llvm/lib/Support/Unix/Threading.inc index afb887fc10960..8cacaa83e961a 100644 --- a/llvm/lib/Support/Unix/Threading.inc +++ b/llvm/lib/Support/Unix/Threading.inc @@ -267,3 +267,27 @@ SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) { #endif return SetThreadPriorityResult::FAILURE; } + +#include + +int computeHostNumHardwareThreads() { +#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT) + cpu_set_t Set; + if (sched_getaffinity(0, sizeof(Set), &Set)) + return CPU_COUNT(&Set); +#endif + // Guard against std::thread::hardware_concurrency() returning 0. + if (unsigned Val = std::thread::hardware_concurrency()) + return Val; + return 1; +} + +void llvm::ThreadPoolStrategy::apply_thread_strategy( + unsigned ThreadPoolNum) const {} + +llvm::BitVector llvm::get_thread_affinity_mask() { + // FIXME: Implement + llvm_unreachable("Not implemented!"); +} + +unsigned llvm::get_cpus() { return 1; } diff --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc index 9456efa686ffc..eb92296212263 100644 --- a/llvm/lib/Support/Windows/Threading.inc +++ b/llvm/lib/Support/Windows/Threading.inc @@ -16,6 +16,8 @@ #include "WindowsSupport.h" #include +#include + // Windows will at times define MemoryFence. #ifdef MemoryFence #undef MemoryFence @@ -122,3 +124,163 @@ SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) { ? 
SetThreadPriorityResult::SUCCESS : SetThreadPriorityResult::FAILURE; } + +struct ProcessorGroup { + unsigned ID; + unsigned AllThreads; + unsigned UsableThreads; + unsigned ThreadsPerCore; + uint64_t Affinity; +}; + +template +static bool IterateProcInfo(LOGICAL_PROCESSOR_RELATIONSHIP Relationship, F Fn) { + DWORD Len = 0; + BOOL R = ::GetLogicalProcessorInformationEx(Relationship, NULL, &Len); + if (R || GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + return false; + } + auto *Info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)calloc(1, Len); + R = ::GetLogicalProcessorInformationEx(Relationship, Info, &Len); + if (R) { + auto *End = + (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Info + Len); + for (auto *Curr = Info; Curr < End; + Curr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Curr + + Curr->Size)) { + if (Curr->Relationship != Relationship) + continue; + Fn(Curr); + } + } + free(Info); + return true; +} + +static ArrayRef getProcessorGroups() { + auto computeGroups = []() { + SmallVector Groups; + + auto HandleGroup = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) { + GROUP_RELATIONSHIP &El = ProcInfo->Group; + for (unsigned J = 0; J < El.ActiveGroupCount; ++J) { + ProcessorGroup G; + G.ID = Groups.size(); + G.AllThreads = El.GroupInfo[J].MaximumProcessorCount; + G.UsableThreads = El.GroupInfo[J].ActiveProcessorCount; + assert(G.UsableThreads <= 64); + G.Affinity = El.GroupInfo[J].ActiveProcessorMask; + Groups.push_back(G); + } + }; + + if (!IterateProcInfo(RelationGroup, HandleGroup)) + return std::vector(); + + auto HandleProc = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) { + PROCESSOR_RELATIONSHIP &El = ProcInfo->Processor; + assert(El.GroupCount == 1); + unsigned NumHyperThreads = 1; + // If the flag is set, each core supports more than one hyper-thread. 
+ if (El.Flags & LTP_PC_SMT) + NumHyperThreads = std::bitset<64>(El.GroupMask[0].Mask).count(); + unsigned I = El.GroupMask[0].Group; + Groups[I].ThreadsPerCore = NumHyperThreads; + }; + + if (!IterateProcInfo(RelationProcessorCore, HandleProc)) + return std::vector(); + + // If there's an affinity mask set on one of the CPUs, then assume the user + // wants to constrain the current process to only a single CPU. + for (auto &G : Groups) { + if (G.UsableThreads != G.AllThreads) { + ProcessorGroup NewG{G}; + Groups.clear(); + Groups.push_back(NewG); + break; + } + } + + return std::vector(Groups.begin(), Groups.end()); + }; + static auto Groups = computeGroups(); + return ArrayRef(Groups); +} + +template +static unsigned aggregate(R &&Range, UnaryPredicate P) { + unsigned I{}; + for (const auto &It : Range) + I += P(It); + return I; +} + +// for sys::getHostNumPhysicalCores +int computeHostNumPhysicalCores() { + static unsigned Cores = + aggregate(getProcessorGroups(), [](const ProcessorGroup &G) { + return G.UsableThreads / G.ThreadsPerCore; + }); + return Cores; +} + +int computeHostNumHardwareThreads() { + static unsigned Threads = + aggregate(getProcessorGroups(), + [](const ProcessorGroup &G) { return G.UsableThreads; }); + return Threads; +} + +// Assign the current thread to a more appropriate CPU socket or CPU group +void llvm::ThreadPoolStrategy::apply_thread_strategy( + unsigned ThreadPoolNum) const { + ArrayRef Groups = getProcessorGroups(); + + assert(ThreadPoolNum < compute_thread_count() && + "The thread index is not within thread strategy's range!"); + + // In this mode, the ThreadNumber represents the core number, not the + // hyper-thread number. Assumes all NUMA groups have the same amount of + // hyper-threads. 
+ if (!UseHyperThreads) + ThreadPoolNum *= Groups[0].ThreadsPerCore; + + unsigned ThreadRangeStart = 0; + for (unsigned I = 0; I < Groups.size(); ++I) { + const ProcessorGroup &G = Groups[I]; + if (ThreadPoolNum >= ThreadRangeStart && + ThreadPoolNum < ThreadRangeStart + G.UsableThreads) { + + GROUP_AFFINITY Affinity{}; + Affinity.Group = G.ID; + Affinity.Mask = G.Affinity; + SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr); + } + ThreadRangeStart += G.UsableThreads; + } +} + +llvm::BitVector llvm::get_thread_affinity_mask() { + GROUP_AFFINITY Affinity{}; + GetThreadGroupAffinity(GetCurrentThread(), &Affinity); + + static unsigned All = + aggregate(getProcessorGroups(), + [](const ProcessorGroup &G) { return G.AllThreads; }); + + unsigned StartOffset = + aggregate(getProcessorGroups(), [&](const ProcessorGroup &G) { + return G.ID < Affinity.Group ? G.AllThreads : 0; + }); + + llvm::BitVector V; + V.resize(All); + for (unsigned I = 0; I < sizeof(KAFFINITY) * 8; ++I) { + if ((Affinity.Mask >> I) & 1) + V.set(StartOffset + I); + } + return V; +} + +unsigned llvm::get_cpus() { return getProcessorGroups().size(); } diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 68b58b061765b..010cfe544b702 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -429,6 +429,57 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, default: break; + case AArch64::BSPv8i8: + case AArch64::BSPv16i8: { + Register DstReg = MI.getOperand(0).getReg(); + if (DstReg == MI.getOperand(3).getReg()) { + // Expand to BIT + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? 
AArch64::BITv8i8 + : AArch64::BITv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(3)) + .add(MI.getOperand(2)) + .add(MI.getOperand(1)); + } else if (DstReg == MI.getOperand(2).getReg()) { + // Expand to BIF + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8 + : AArch64::BIFv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(1)); + } else { + // Expand to BSL, use additional move if required + if (DstReg == MI.getOperand(1).getReg()) { + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 + : AArch64::BSLv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + } else { + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8 + : AArch64::ORRv16i8)) + .addReg(DstReg) + .add(MI.getOperand(1)) + .add(MI.getOperand(1)); + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? 
AArch64::BSLv8i8 + : AArch64::BSLv16i8)) + .add(MI.getOperand(0)) + .addReg(DstReg) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + } + } + MI.eraseFromParent(); + return true; + } + case AArch64::ADDWrr: case AArch64::SUBWrr: case AArch64::ADDXrr: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 9736a18832c06..a64baa9f5b4d0 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1287,7 +1287,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; case AArch64ISD::BICi: return "AArch64ISD::BICi"; case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; - case AArch64ISD::BSL: return "AArch64ISD::BSL"; + case AArch64ISD::BSP: return "AArch64ISD::BSP"; case AArch64ISD::NEG: return "AArch64ISD::NEG"; case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; @@ -10229,7 +10229,7 @@ static SDValue tryCombineToBSL(SDNode *N, } if (FoundMatch) - return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), + return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0), N0->getOperand(1 - i), N1->getOperand(1 - j)); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index f664484a88038..52728d5abd557 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -90,9 +90,9 @@ enum NodeType : unsigned { BICi, ORRi, - // Vector bit select: similar to ISD::VSELECT but not all bits within an + // Vector bitwise select: similar to ISD::VSELECT but not all bits within an // element must be identical. 
- BSL, + BSP, // Vector arithmetic negation NEG, @@ -166,7 +166,7 @@ enum NodeType : unsigned { // Vector bitwise negation NOT, - // Vector bitwise selection + // Vector bitwise insertion BIT, // Compare-and-branch diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index b2fa8a55c252d..43ceec94c98b9 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -5207,6 +5207,47 @@ class BaseSIMDThreeSameVectorTied size, bits<5> opcode, let Inst{4-0} = Rd; } +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVectorPseudo pattern> + : Pseudo<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), pattern>, + Sched<[WriteV]>; + +multiclass SIMDLogicalThreeVectorPseudo { + def v8i8 : BaseSIMDThreeSameVectorPseudo; + def v16i8 : BaseSIMDThreeSameVectorPseudo; + + def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS), + (v4i16 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS), + (v2i32 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS), + (v1i64 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + + def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS), + (v8i16 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS), + (v4i32 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS), + (v2i64 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; +} + // All operand sizes distinguished in the encoding. 
multiclass SIMDThreeSameVector opc, string asm, SDPatternOperator OpNode> { @@ -5427,7 +5468,7 @@ multiclass SIMDLogicalThreeVector size, string asm, } multiclass SIMDLogicalThreeVectorTied size, - string asm, SDPatternOperator OpNode> { + string asm, SDPatternOperator OpNode = null_frag> { def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$dst), diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 67c7039e46795..de92bd37b5042 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -468,7 +468,7 @@ def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>; def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>; def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>; -def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>; +def AArch64bsp: SDNode<"AArch64ISD::BSP", SDT_AArch64trivec>; def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>; def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>; @@ -3955,33 +3955,36 @@ defm : SIMDThreeSameVectorExtraPatterns<"UQSUB", usubsat>; defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; -defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">; -defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; -defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", - TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>; defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>; defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; - -def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, 
V64:$Rm)>; -def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; - -def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +// Pseudo bitwise select pattern BSP. +// It is expanded into BSL/BIT/BIF after register allocation. +defm BSP : SIMDLogicalThreeVectorPseudo>; +defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl">; +defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; +defm BIF : SIMDLogicalThreeVectorTied<1, 0b11, "bif">; + +def : Pat<(AArch64bsp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(AArch64bsp (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}", (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td index 
9f566d1c7079b..19ff13524fa8d 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA57.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -501,7 +501,7 @@ def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; // Q form - v16i8, v8i16, v4i32, v2i64 // ASIMD bitwise insert, Q-form -def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL)v16i8")>; +def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL|BSP)v16i8")>; // ASIMD duplicate, gen reg, D-form and Q-form def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td index 798ecb7508c08..a79155dc06fb9 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td +++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td @@ -494,7 +494,7 @@ def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>; // WriteV includes: // SHLL,SSHLL,USHLL // SLI,SRI -// BIF,BIT,BSL +// BIF,BIT,BSL,BSP // EXT // CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN // XTN2 diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td index d1734c455b2b4..08f562c1eaac7 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td @@ -660,7 +660,7 @@ def : InstRW<[M3WriteNEONY], (instrs FSQRTv2f64)>; // ASIMD miscellaneous instructions. 
def : InstRW<[M3WriteNALU1], (instregex "^RBITv")>; -def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M3WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M3WriteNSHF1], (instregex "^DUPv.+lane")>; def : InstRW<[M3WriteNSHF1], (instregex "^EXTv")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td index d2284f9fa0b50..ade4493545e1f 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td @@ -803,7 +803,7 @@ def : InstRW<[M4WriteNEONY], (instrs FSQRTv2f64)>; // ASIMD miscellaneous instructions. def : InstRW<[M4WriteNALU1], (instregex "^RBITv")>; -def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M4WriteNALU1], (instregex "^CL[STZ]v")>; def : InstRW<[M4WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M4WriteNSHF1], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td index df7402591e7b9..cfc5dfc9f49f1 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td @@ -841,7 +841,7 @@ def : InstRW<[M5WriteNEONY], (instrs FSQRTv2f64)>; // ASIMD miscellaneous instructions. 
def : InstRW<[M5WriteNALU2], (instregex "^RBITv")>; -def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M5WriteNALU2], (instregex "^CL[STZ]v")>; def : InstRW<[M5WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M5WriteNSHF2], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td index 697a0f69c58cb..f2cd83caffa2b 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -911,7 +911,7 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$") def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>; def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v8i8$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>; def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>; @@ -935,7 +935,7 @@ def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc], (instregex "^INSv(i32|i64)(gpr|lane)$")>; def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; -def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v16i8$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>; def : InstRW<[FalkorWr_2VXVY_0cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td 
b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td index 4c60992e6351a..bc5ad0f8beced 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td +++ b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td @@ -462,13 +462,13 @@ def KryoWrite_1cyc_X_noRSV_74ln : let Latency = 1; let NumMicroOps = 2; } def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln], - (instrs BIFv8i8, BITv8i8, BSLv8i8)>; + (instrs BIFv8i8, BITv8i8, BSLv8i8, BSPv8i8)>; def KryoWrite_1cyc_X_X_75ln : SchedWriteRes<[KryoUnitX, KryoUnitX]> { let Latency = 1; let NumMicroOps = 2; } def : InstRW<[KryoWrite_1cyc_X_X_75ln], - (instrs BIFv16i8, BITv16i8, BSLv16i8)>; + (instrs BIFv16i8, BITv16i8, BSLv16i8, BSPv16i8)>; def KryoWrite_0cyc_noRSV_11ln : SchedWriteRes<[]> { let Latency = 0; let NumMicroOps = 1; diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td index e2a293c068774..40738976bdaa2 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -1482,7 +1482,7 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>; // ASIMD bitwise insert, D-form // ASIMD bitwise insert, Q-form def : InstRW<[THX2T99Write_5Cyc_F01], - (instregex "^BIFv", "^BITv", "^BSLv")>; + (instregex "^BIFv", "^BITv", "^BSLv", "^BSPv")>; // ASIMD count, D-form // ASIMD count, Q-form diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 8840b0a180c09..49d5fbbbc1268 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -39,6 +39,7 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/Transforms/Utils/IntegerDivision.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" @@ -63,6 +64,21 @@ static cl::opt UseMul24Intrin( cl::ReallyHidden, cl::init(true)); +// Legalize 64-bit division by using 
the generic IR expansion. +static cl::opt ExpandDiv64InIR( + "amdgpu-codegenprepare-expand-div64", + cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, + cl::init(false)); + +// Leave all division operations as they are. This supersedes ExpandDiv64InIR +// and is used for testing the legalizer. +static cl::opt DisableIDivExpand( + "amdgpu-codegenprepare-disable-idiv-expansion", + cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, + cl::init(false)); + class AMDGPUCodeGenPrepare : public FunctionPass, public InstVisitor { const GCNSubtarget *ST = nullptr; @@ -160,16 +176,27 @@ class AMDGPUCodeGenPrepare : public FunctionPass, bool divHasSpecialOptimization(BinaryOperator &I, Value *Num, Value *Den) const; + int getDivNumBits(BinaryOperator &I, + Value *Num, Value *Den, + unsigned AtLeast, bool Signed) const; /// Expands 24 bit div or rem. Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den, bool IsDiv, bool IsSigned) const; + Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I, + Value *Num, Value *Den, unsigned NumBits, + bool IsDiv, bool IsSigned) const; + /// Expands 32 bit div or rem. Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den) const; + Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I, + Value *Num, Value *Den) const; + void expandDivRem64(BinaryOperator &I) const; + /// Widen a scalar load. /// /// \details \p Widen scalar load for uniform, small type loads from constant @@ -806,30 +833,49 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) { return getMul64(Builder, LHS, RHS).second; } -// The fractional part of a float is enough to accurately represent up to -// a 24-bit signed integer. 
-Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, - BinaryOperator &I, - Value *Num, Value *Den, - bool IsDiv, bool IsSigned) const { - assert(Num->getType()->isIntegerTy(32)); - +/// Figure out how many bits are really needed for this ddivision. \p AtLeast is +/// an optimization hint to bypass the second ComputeNumSignBits call if we the +/// first one is insufficient. Returns -1 on failure. +int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I, + Value *Num, Value *Den, + unsigned AtLeast, bool IsSigned) const { const DataLayout &DL = Mod->getDataLayout(); unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I); - if (LHSSignBits < 9) - return nullptr; + if (LHSSignBits < AtLeast) + return -1; unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I); - if (RHSSignBits < 9) - return nullptr; - + if (RHSSignBits < AtLeast) + return -1; unsigned SignBits = std::min(LHSSignBits, RHSSignBits); - unsigned DivBits = 32 - SignBits; + unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits; if (IsSigned) ++DivBits; + return DivBits; +} +// The fractional part of a float is enough to accurately represent up to +// a 24-bit signed integer. 
+Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, + BinaryOperator &I, + Value *Num, Value *Den, + bool IsDiv, bool IsSigned) const { + int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned); + if (DivBits == -1) + return nullptr; + return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned); +} + +Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder, + BinaryOperator &I, + Value *Num, Value *Den, + unsigned DivBits, + bool IsDiv, bool IsSigned) const { Type *I32Ty = Builder.getInt32Ty(); + Num = Builder.CreateTrunc(Num, I32Ty); + Den = Builder.CreateTrunc(Den, I32Ty); + Type *F32Ty = Builder.getFloatTy(); ConstantInt *One = Builder.getInt32(1); Value *JQ = One; @@ -901,13 +947,18 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, Res = Builder.CreateSub(Num, Rem); } - // Extend in register from the number of bits this divide really is. - if (IsSigned) { - Res = Builder.CreateShl(Res, 32 - DivBits); - Res = Builder.CreateAShr(Res, 32 - DivBits); - } else { - ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1); - Res = Builder.CreateAnd(Res, TruncMask); + if (DivBits != 0 && DivBits < 32) { + // Extend in register from the number of bits this divide really is. + if (IsSigned) { + int InRegBits = 32 - DivBits; + + Res = Builder.CreateShl(Res, InRegBits); + Res = Builder.CreateAShr(Res, InRegBits); + } else { + ConstantInt *TruncMask + = Builder.getInt32((UINT64_C(1) << DivBits) - 1); + Res = Builder.CreateAnd(Res, TruncMask); + } } return Res; @@ -981,8 +1032,8 @@ Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder, } if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) { - Res = Builder.CreateTrunc(Res, Ty); - return Res; + return IsSigned ? 
Builder.CreateSExtOrTrunc(Res, Ty) : + Builder.CreateZExtOrTrunc(Res, Ty); } ConstantInt *Zero = Builder.getInt32(0); @@ -1093,6 +1144,53 @@ Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder, return Res; } +Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder, + BinaryOperator &I, + Value *Num, Value *Den) const { + if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den)) + return nullptr; // Keep it for later optimization. + + Instruction::BinaryOps Opc = I.getOpcode(); + + bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv; + bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem; + + int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned); + if (NumDivBits == -1) + return nullptr; + + Value *Narrowed = nullptr; + if (NumDivBits <= 24) { + Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits, + IsDiv, IsSigned); + } else if (NumDivBits <= 32) { + Narrowed = expandDivRem32(Builder, I, Num, Den); + } + + if (Narrowed) { + return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) : + Builder.CreateZExt(Narrowed, Num->getType()); + } + + return nullptr; +} + +void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const { + Instruction::BinaryOps Opc = I.getOpcode(); + // Do the general expansion. 
+ if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) { + expandDivisionUpTo64Bits(&I); + return; + } + + if (Opc == Instruction::URem || Opc == Instruction::SRem) { + expandRemainderUpTo64Bits(&I); + return; + } + + llvm_unreachable("not a division"); +} + bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { if (foldBinOpIntoSelect(I)) return true; @@ -1108,9 +1206,14 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { Instruction::BinaryOps Opc = I.getOpcode(); Type *Ty = I.getType(); Value *NewDiv = nullptr; + unsigned ScalarSize = Ty->getScalarSizeInBits(); + + SmallVector Div64ToExpand; + if ((Opc == Instruction::URem || Opc == Instruction::UDiv || Opc == Instruction::SRem || Opc == Instruction::SDiv) && - Ty->getScalarSizeInBits() <= 32) { + ScalarSize <= 64 && + !DisableIDivExpand) { Value *Num = I.getOperand(0); Value *Den = I.getOperand(1); IRBuilder<> Builder(&I); @@ -1122,13 +1225,35 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) { Value *NumEltN = Builder.CreateExtractElement(Num, N); Value *DenEltN = Builder.CreateExtractElement(Den, N); - Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN); - if (!NewElt) - NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); + + Value *NewElt; + if (ScalarSize <= 32) { + NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN); + if (!NewElt) + NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); + } else { + // See if this 64-bit division can be shrunk to 32/24-bits before + // producing the general expansion. + NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN); + if (!NewElt) { + // The general 64-bit expansion introduces control flow and doesn't + // return the new value. Just insert a scalar copy and defer + // expanding it. 
+ NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); + Div64ToExpand.push_back(cast(NewElt)); + } + } + NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N); } } else { - NewDiv = expandDivRem32(Builder, I, Num, Den); + if (ScalarSize <= 32) + NewDiv = expandDivRem32(Builder, I, Num, Den); + else { + NewDiv = shrinkDivRem64(Builder, I, Num, Den); + if (!NewDiv) + Div64ToExpand.push_back(&I); + } } if (NewDiv) { @@ -1138,6 +1263,14 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { } } + if (ExpandDiv64InIR) { + // TODO: We get much worse code in specially handled constant cases. + for (BinaryOperator *Div : Div64ToExpand) { + expandDivRem64(*Div); + Changed = true; + } + } + return Changed; } @@ -1255,11 +1388,25 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { bool MadeChange = false; - for (BasicBlock &BB : F) { + Function::iterator NextBB; + for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) { + BasicBlock *BB = &*FI; + NextBB = std::next(FI); + BasicBlock::iterator Next; - for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) { Next = std::next(I); + MadeChange |= visit(*I); + + if (Next != E) { // Control flow changed + BasicBlock *NextInstBB = Next->getParent(); + if (NextInstBB != BB) { + BB = NextInstBB; + E = BB->end(); + FE = F.end(); + } + } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 3dcef2f2415af..f8fee8621a519 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -143,6 +143,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index be98f74de9536..1cf95c5b522f1 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -436,7 +436,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_FPTRUNC) .legalFor({{S32, S64}, {S16, S32}}) - .scalarize(0); + .scalarize(0) + .lower(); getActionDefinitionsBuilder(G_FPEXT) .legalFor({{S64, S32}, {S32, S16}}) @@ -597,7 +598,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(0, 32) .widenScalarToNextPow2(1, 32); - // TODO: Expand for > s32 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) .legalFor({S32}) .clampScalar(0, S32, S32) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 59151a3346e61..22528e243a4f0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3096,9 +3096,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_FMAXNUM_IEEE: case AMDGPU::G_FCANONICALIZE: case AMDGPU::G_INTRINSIC_TRUNC: + case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? 
case AMDGPU::G_AMDGPU_FFBH_U32: case AMDGPU::G_AMDGPU_FMIN_LEGACY: case AMDGPU::G_AMDGPU_FMAX_LEGACY: + case AMDGPU::G_AMDGPU_RCP_IFLAG: return getDefaultMappingVOP(MI); case AMDGPU::G_UMULH: case AMDGPU::G_SMULH: { @@ -3182,7 +3184,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_BITCAST: case AMDGPU::G_INTTOPTR: case AMDGPU::G_PTRTOINT: - case AMDGPU::G_BSWAP: case AMDGPU::G_BITREVERSE: case AMDGPU::G_FABS: case AMDGPU::G_FNEG: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 31c06ce0bfbfb..fb488d2b1aab1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -150,7 +150,9 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; - if (DoesNotSupportXNACK && EnableXNACK) { + // Disable XNACK on targets where it is not enabled by default unless it is + // explicitly requested. + if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) { ToggleFeature(AMDGPU::FeatureXNACK); EnableXNACK = false; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e9679cdf95978..d250af225345f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -365,6 +365,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // FIXME: This should be narrowed to i32, but that only happens if i64 is // illegal. + // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32. 
setOperationAction(ISD::BSWAP, MVT::i64, Legal); setOperationAction(ISD::BSWAP, MVT::i32, Legal); @@ -467,7 +468,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SREM, MVT::i16, Promote); setOperationAction(ISD::UREM, MVT::i16, Promote); - setOperationAction(ISD::BSWAP, MVT::i16, Promote); setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); setOperationAction(ISD::CTTZ, MVT::i16, Promote); @@ -549,6 +549,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } } + // v_perm_b32 can handle either of these. + setOperationAction(ISD::BSWAP, MVT::i16, Legal); + setOperationAction(ISD::BSWAP, MVT::v2i16, Legal); + setOperationAction(ISD::BSWAP, MVT::v4i16, Custom); + // XXX - Do these do anything? Vector constants turn into build_vector. setOperationAction(ISD::Constant, MVT::v2i16, Legal); setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal); @@ -3909,7 +3914,7 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4f16); + assert(VT == MVT::v4f16 || VT == MVT::v4i16); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4018,6 +4023,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FABS: case ISD::FNEG: case ISD::FCANONICALIZE: + case ISD::BSWAP: return splitUnaryVectorOp(Op, DAG); case ISD::FMINNUM: case ISD::FMAXNUM: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 9dd51bf4a27d9..beab2eb205fba 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1782,8 +1782,8 @@ def : GCNPat < def : GCNPat < (i32 (bswap i32:$a)), (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)), - (V_ALIGNBIT_B32 $a, $a, (i32 24)), - (V_ALIGNBIT_B32 $a, $a, (i32 8))) + (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 24)), + (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, 
(i32 8))) >; // FIXME: This should have been narrowed to i32 during legalization. @@ -1809,8 +1809,9 @@ def : GCNPat < sub1) >; - -let SubtargetPredicate = isGFX8Plus in { +// FIXME: The AddedComplexity should not be needed, but in GlobalISel +// the BFI pattern ends up taking precedence without it. +let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in { // Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24) // // My reading of the manual suggests we should be using src0 for the @@ -1833,6 +1834,24 @@ def : GCNPat < sub1) >; +// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24) +// The 12s emit 0s. +def : GCNPat < + (i16 (bswap i16:$a)), + (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) +>; + +def : GCNPat < + (i32 (zext (bswap i16:$a))), + (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) +>; + +// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24) +def : GCNPat < + (v2i16 (bswap v2i16:$a)), + (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001))) +>; + } let OtherPredicates = [NoFP16Denormals] in { @@ -2194,6 +2213,12 @@ def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction { let hasSideEffects = 0; } +def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src); + let hasSideEffects = 0; +} + class BufferLoadGenericInstruction : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset, diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 7089ba2f77240..bef2fb349741c 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -1294,9 +1294,28 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc) { MCContext &Context = getParser().getContext(); const MCRegisterInfo *RI = getContext().getRegisterInfo(); - 
std::string r = "r"; - std::string v = "v"; - std::string Colon = ":"; + const std::string r = "r"; + const std::string v = "v"; + const std::string Colon = ":"; + using RegPairVals = std::pair; + auto GetRegPair = [this, r](RegPairVals RegPair) { + const std::string R1 = r + utostr(RegPair.first); + const std::string R2 = r + utostr(RegPair.second); + + return std::make_pair(matchRegister(R1), matchRegister(R2)); + }; + auto GetScalarRegs = [RI, GetRegPair](unsigned RegPair) { + const unsigned Lower = RI->getEncodingValue(RegPair); + const RegPairVals RegPair_ = std::make_pair(Lower + 1, Lower); + + return GetRegPair(RegPair_); + }; + auto GetVecRegs = [GetRegPair](unsigned VecRegPair) { + const RegPairVals RegPair = + HexagonMCInstrInfo::GetVecRegPairIndices(VecRegPair); + + return GetRegPair(RegPair); + }; bool is32bit = false; // used to distinguish between CONST32 and CONST64 switch (Inst.getOpcode()) { @@ -1388,14 +1407,9 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)" case Hexagon::A2_tfrp: { MCOperand &MO = Inst.getOperand(1); - unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); - std::string R1 = r + utostr(RegPairNum + 1); - StringRef Reg1(R1); - MO.setReg(matchRegister(Reg1)); - // Add a new operand for the second register in the pair. 
- std::string R2 = r + utostr(RegPairNum); - StringRef Reg2(R2); - Inst.addOperand(MCOperand::createReg(matchRegister(Reg2))); + const std::pair RegPair = GetScalarRegs(MO.getReg()); + MO.setReg(RegPair.first); + Inst.addOperand(MCOperand::createReg(RegPair.second)); Inst.setOpcode(Hexagon::A2_combinew); break; } @@ -1403,14 +1417,9 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, case Hexagon::A2_tfrpt: case Hexagon::A2_tfrpf: { MCOperand &MO = Inst.getOperand(2); - unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); - std::string R1 = r + utostr(RegPairNum + 1); - StringRef Reg1(R1); - MO.setReg(matchRegister(Reg1)); - // Add a new operand for the second register in the pair. - std::string R2 = r + utostr(RegPairNum); - StringRef Reg2(R2); - Inst.addOperand(MCOperand::createReg(matchRegister(Reg2))); + const std::pair RegPair = GetScalarRegs(MO.getReg()); + MO.setReg(RegPair.first); + Inst.addOperand(MCOperand::createReg(RegPair.second)); Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt) ? Hexagon::C2_ccombinewt : Hexagon::C2_ccombinewf); @@ -1419,14 +1428,9 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, case Hexagon::A2_tfrptnew: case Hexagon::A2_tfrpfnew: { MCOperand &MO = Inst.getOperand(2); - unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); - std::string R1 = r + utostr(RegPairNum + 1); - StringRef Reg1(R1); - MO.setReg(matchRegister(Reg1)); - // Add a new operand for the second register in the pair. - std::string R2 = r + utostr(RegPairNum); - StringRef Reg2(R2); - Inst.addOperand(MCOperand::createReg(matchRegister(Reg2))); + const std::pair RegPair = GetScalarRegs(MO.getReg()); + MO.setReg(RegPair.first); + Inst.addOperand(MCOperand::createReg(RegPair.second)); Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew) ? 
Hexagon::C2_ccombinewnewt : Hexagon::C2_ccombinewnewf); @@ -1436,12 +1440,9 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, // Translate a "$Vdd = $Vss" to "$Vdd = vcombine($Vs, $Vt)" case Hexagon::V6_vassignp: { MCOperand &MO = Inst.getOperand(1); - unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); - std::string R1 = v + utostr(RegPairNum + 1); - MO.setReg(MatchRegisterName(R1)); - // Add a new operand for the second register in the pair. - std::string R2 = v + utostr(RegPairNum); - Inst.addOperand(MCOperand::createReg(MatchRegisterName(R2))); + const std::pair RegPair = GetVecRegs(MO.getReg()); + MO.setReg(RegPair.first); + Inst.addOperand(MCOperand::createReg(RegPair.second)); Inst.setOpcode(Hexagon::V6_vcombine); break; } diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index d71409de5e356..f3a87ef20a608 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -498,9 +498,13 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB, } else if (HexagonMCInstrInfo::hasNewValue(*MCII, Inst)) { unsigned Producer = HexagonMCInstrInfo::getNewValueOperand(*MCII, Inst).getReg(); - if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15) - Producer = ((Producer - Hexagon::W0) << 1) + SubregBit + Hexagon::V0; - else if (SubregBit) + + if (HexagonMCInstrInfo::IsVecRegPair(Producer)) { + const bool Rev = HexagonMCInstrInfo::IsReverseVecRegPair(Producer); + const unsigned ProdPairIndex = + Rev ? Producer - Hexagon::WR0 : Producer - Hexagon::W0; + Producer = (ProdPairIndex << 1) + SubregBit + Hexagon::V0; + } else if (SubregBit) // Hexagon PRM 10.11 New-value operands // Nt[0] is reserved and should always be encoded as zero. 
return MCDisassembler::Fail; @@ -606,12 +610,16 @@ static DecodeStatus DecodeHvxWRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { static const MCPhysReg HvxWRDecoderTable[] = { - Hexagon::W0, Hexagon::W1, Hexagon::W2, Hexagon::W3, - Hexagon::W4, Hexagon::W5, Hexagon::W6, Hexagon::W7, - Hexagon::W8, Hexagon::W9, Hexagon::W10, Hexagon::W11, - Hexagon::W12, Hexagon::W13, Hexagon::W14, Hexagon::W15}; + Hexagon::W0, Hexagon::WR0, Hexagon::W1, Hexagon::WR1, Hexagon::W2, + Hexagon::WR2, Hexagon::W3, Hexagon::WR3, Hexagon::W4, Hexagon::WR4, + Hexagon::W5, Hexagon::WR5, Hexagon::W6, Hexagon::WR6, Hexagon::W7, + Hexagon::WR7, Hexagon::W8, Hexagon::WR8, Hexagon::W9, Hexagon::WR9, + Hexagon::W10, Hexagon::WR10, Hexagon::W11, Hexagon::WR11, Hexagon::W12, + Hexagon::WR12, Hexagon::W13, Hexagon::WR13, Hexagon::W14, Hexagon::WR14, + Hexagon::W15, Hexagon::WR15, + }; - return (DecodeRegisterClass(Inst, RegNo >> 1, HvxWRDecoderTable)); + return DecodeRegisterClass(Inst, RegNo, HvxWRDecoderTable); } LLVM_ATTRIBUTE_UNUSED // Suppress warning temporarily. diff --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp index d55aeaf10852d..2cb3f7c6573e0 100644 --- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -172,6 +172,13 @@ BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF) Reserved.set(Hexagon::C8); Reserved.set(Hexagon::USR_OVF); + // Leveraging these registers will require more work to recognize + // the new semantics posed, Hi/LoVec patterns, etc. + // Note well: if enabled, they should be restricted to only + // where `HST.useHVXOps() && HST.hasV67Ops()` is true. 
+ for (auto Reg : Hexagon_MC::GetVectRegRev()) + Reserved.set(Reg); + if (MF.getSubtarget().hasReservedR19()) Reserved.set(Hexagon::R19); diff --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td index c23b837bb62fc..ea39dc44d15be 100644 --- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td +++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td @@ -18,6 +18,12 @@ let Namespace = "Hexagon" in { let HWEncoding{4-0} = num; } + // These registers are used to preserve a distinction between + // vector register pairs of differing order. + class HexagonFakeReg : Register { + let isArtificial = 1; + } + class HexagonDoubleReg num, string n, list subregs, list alt = []> : RegisterWithSubRegs { @@ -30,6 +36,13 @@ let Namespace = "Hexagon" in { class Ri num, string n, list alt = []> : HexagonReg; + // Rp - false/pseudo registers. These registers are used + // to provide a distinct set of aliases for both styles of vector + // register pairs without encountering subregister indexing constraints. + class R_fake : + HexagonFakeReg; + + // Rf - 32-bit floating-point registers. class Rf num, string n> : HexagonReg; @@ -81,6 +94,7 @@ let Namespace = "Hexagon" in { def isub_hi : SubRegIndex<32, 32>; def vsub_lo : SubRegIndex<512>; def vsub_hi : SubRegIndex<512, 512>; + def vsub_fake: SubRegIndex<512>; def wsub_lo : SubRegIndex<1024>; def wsub_hi : SubRegIndex<1024, 1024>; def subreg_overflow : SubRegIndex<1, 0>; @@ -183,27 +197,49 @@ let Namespace = "Hexagon" in { foreach i = 0-31 in { def V#i : Ri, DwarfRegNum<[!add(i, 99)]>; + def VF#i : R_fake<"__"#!add(i,999999)>, DwarfRegNum<[!add(i, 999999)]>; + def VFR#i : R_fake<"__"#!add(i,9999999)>, DwarfRegNum<[!add(i, 9999999)]>; } def VTMP : Ri<0, "vtmp">, DwarfRegNum<[131]>; // Aliases of the V* registers used to hold double vec values. 
- let SubRegIndices = [vsub_lo, vsub_hi], CoveredBySubRegs = 1 in { - def W0 : Rd< 0, "v1:0", [V0, V1]>, DwarfRegNum<[99]>; - def W1 : Rd< 2, "v3:2", [V2, V3]>, DwarfRegNum<[101]>; - def W2 : Rd< 4, "v5:4", [V4, V5]>, DwarfRegNum<[103]>; - def W3 : Rd< 6, "v7:6", [V6, V7]>, DwarfRegNum<[105]>; - def W4 : Rd< 8, "v9:8", [V8, V9]>, DwarfRegNum<[107]>; - def W5 : Rd<10, "v11:10", [V10, V11]>, DwarfRegNum<[109]>; - def W6 : Rd<12, "v13:12", [V12, V13]>, DwarfRegNum<[111]>; - def W7 : Rd<14, "v15:14", [V14, V15]>, DwarfRegNum<[113]>; - def W8 : Rd<16, "v17:16", [V16, V17]>, DwarfRegNum<[115]>; - def W9 : Rd<18, "v19:18", [V18, V19]>, DwarfRegNum<[117]>; - def W10 : Rd<20, "v21:20", [V20, V21]>, DwarfRegNum<[119]>; - def W11 : Rd<22, "v23:22", [V22, V23]>, DwarfRegNum<[121]>; - def W12 : Rd<24, "v25:24", [V24, V25]>, DwarfRegNum<[123]>; - def W13 : Rd<26, "v27:26", [V26, V27]>, DwarfRegNum<[125]>; - def W14 : Rd<28, "v29:28", [V28, V29]>, DwarfRegNum<[127]>; - def W15 : Rd<30, "v31:30", [V30, V31]>, DwarfRegNum<[129]>; + let SubRegIndices = [vsub_lo, vsub_hi, vsub_fake], CoveredBySubRegs = 1 in { + def W0 : Rd< 0, "v1:0", [V0, V1, VF0]>, DwarfRegNum<[99]>; + def W1 : Rd< 2, "v3:2", [V2, V3, VF1]>, DwarfRegNum<[101]>; + def W2 : Rd< 4, "v5:4", [V4, V5, VF2]>, DwarfRegNum<[103]>; + def W3 : Rd< 6, "v7:6", [V6, V7, VF3]>, DwarfRegNum<[105]>; + def W4 : Rd< 8, "v9:8", [V8, V9, VF4]>, DwarfRegNum<[107]>; + def W5 : Rd<10, "v11:10", [V10, V11, VF5]>, DwarfRegNum<[109]>; + def W6 : Rd<12, "v13:12", [V12, V13, VF6]>, DwarfRegNum<[111]>; + def W7 : Rd<14, "v15:14", [V14, V15, VF7]>, DwarfRegNum<[113]>; + def W8 : Rd<16, "v17:16", [V16, V17, VF8]>, DwarfRegNum<[115]>; + def W9 : Rd<18, "v19:18", [V18, V19, VF9]>, DwarfRegNum<[117]>; + def W10 : Rd<20, "v21:20", [V20, V21, VF10]>, DwarfRegNum<[119]>; + def W11 : Rd<22, "v23:22", [V22, V23, VF11]>, DwarfRegNum<[121]>; + def W12 : Rd<24, "v25:24", [V24, V25, VF12]>, DwarfRegNum<[123]>; + def W13 : Rd<26, "v27:26", [V26, V27, VF13]>, 
DwarfRegNum<[125]>; + def W14 : Rd<28, "v29:28", [V28, V29, VF14]>, DwarfRegNum<[127]>; + def W15 : Rd<30, "v31:30", [V30, V31, VF15]>, DwarfRegNum<[129]>; + } + + // Reverse Aliases of the V* registers used to hold double vec values. + let SubRegIndices = [vsub_lo, vsub_hi, vsub_fake], CoveredBySubRegs = 1 in { + def WR0 : Rd< 1, "v0:1", [V0, V1, VFR0]>, DwarfRegNum<[161]>; + def WR1 : Rd< 3, "v2:3", [V2, V3, VFR1]>, DwarfRegNum<[162]>; + def WR2 : Rd< 5, "v4:5", [V4, V5, VFR2]>, DwarfRegNum<[163]>; + def WR3 : Rd< 7, "v6:7", [V6, V7, VFR3]>, DwarfRegNum<[164]>; + def WR4 : Rd< 9, "v8:9", [V8, V9, VFR4]>, DwarfRegNum<[165]>; + def WR5 : Rd<11, "v10:11", [V10, V11, VFR5]>, DwarfRegNum<[166]>; + def WR6 : Rd<13, "v12:13", [V12, V13, VFR6]>, DwarfRegNum<[167]>; + def WR7 : Rd<15, "v14:15", [V14, V15, VFR7]>, DwarfRegNum<[168]>; + def WR8 : Rd<17, "v16:17", [V16, V17, VFR8]>, DwarfRegNum<[169]>; + def WR9 : Rd<19, "v18:19", [V18, V19, VFR9]>, DwarfRegNum<[170]>; + def WR10: Rd<21, "v20:21", [V20, V21, VFR10]>, DwarfRegNum<[171]>; + def WR11: Rd<23, "v22:23", [V22, V23, VFR11]>, DwarfRegNum<[172]>; + def WR12: Rd<25, "v24:25", [V24, V25, VFR12]>, DwarfRegNum<[173]>; + def WR13: Rd<27, "v26:27", [V26, V27, VFR13]>, DwarfRegNum<[174]>; + def WR14: Rd<29, "v28:29", [V28, V29, VFR14]>, DwarfRegNum<[175]>; + def WR15: Rd<31, "v30:31", [V30, V31, VFR15]>, DwarfRegNum<[176]>; } // Aliases of the V* registers used to hold quad vec values. 
@@ -314,7 +350,7 @@ def HvxVR : RegisterClass<"Hexagon", [VecI8, VecI16, VecI32], 512, } def HvxWR : RegisterClass<"Hexagon", [VecPI8, VecPI16, VecPI32], 1024, - (add (sequence "W%u", 0, 15))> { + (add (sequence "W%u", 0, 15), (sequence "WR%u", 0, 15))> { let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode], [RegInfo<1024,1024,1024>, RegInfo<2048,2048,2048>, RegInfo<1024,1024,1024>]>; } @@ -365,6 +401,10 @@ def CtrRegs : RegisterClass<"Hexagon", [i32], 32, FRAMELIMIT, FRAMEKEY, PKTCOUNTLO, PKTCOUNTHI, UTIMERLO, UTIMERHI, M0, M1, USR)>; +let Size = 64 in +def VectRegRev : RegisterClass<"Hexagon", [i64], 64, + (add (sequence "WR%u", 0, 15))>; + let isAllocatable = 0 in def UsrBits : RegisterClass<"Hexagon", [i1], 0, (add USR_OVF)>; diff --git a/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp b/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp index 65a8dcd75bdca..fbc5e5c344eda 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp @@ -71,9 +71,10 @@ class HexagonVectorPrint : public MachineFunctionPass { char HexagonVectorPrint::ID = 0; static bool isVecReg(unsigned Reg) { - return (Reg >= Hexagon::V0 && Reg <= Hexagon::V31) - || (Reg >= Hexagon::W0 && Reg <= Hexagon::W15) - || (Reg >= Hexagon::Q0 && Reg <= Hexagon::Q3); + return (Reg >= Hexagon::V0 && Reg <= Hexagon::V31) || + (Reg >= Hexagon::W0 && Reg <= Hexagon::W15) || + (Reg >= Hexagon::WR0 && Reg <= Hexagon::WR15) || + (Reg >= Hexagon::Q0 && Reg <= Hexagon::Q3); } static std::string getStringReg(unsigned R) { diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index 8b262bd0248e0..52c56d6db5242 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -81,6 +81,9 @@ void HexagonMCChecker::initReg(MCInst const &MCI, unsigned R, unsigned &PredReg, if (!MCSubRegIterator(*SRI, &RI).isValid()) // 
Skip super-registers used indirectly. Uses.insert(*SRI); + + if (HexagonMCInstrInfo::IsReverseVecRegPair(R)) + ReversePairs.insert(R); } void HexagonMCChecker::init(MCInst const &MCI) { @@ -133,6 +136,9 @@ void HexagonMCChecker::init(MCInst const &MCI) { if (R == Hexagon::C8) R = Hexagon::USR; + if (HexagonMCInstrInfo::IsReverseVecRegPair(R)) + ReversePairs.insert(R); + // Note register definitions, direct ones as well as indirect side-effects. // Super-registers are not tracked directly, but their components. for (MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid()); @@ -192,7 +198,7 @@ HexagonMCChecker::HexagonMCChecker(MCContext &Context, MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst &mcb, MCRegisterInfo const &ri, bool ReportErrors) : Context(Context), MCB(mcb), RI(ri), MCII(MCII), STI(STI), - ReportErrors(ReportErrors) { + ReportErrors(ReportErrors), ReversePairs() { init(); } @@ -200,7 +206,10 @@ HexagonMCChecker::HexagonMCChecker(HexagonMCChecker const &Other, MCSubtargetInfo const &STI, bool CopyReportErrors) : Context(Other.Context), MCB(Other.MCB), RI(Other.RI), MCII(Other.MCII), - STI(STI), ReportErrors(CopyReportErrors ? Other.ReportErrors : false) {} + STI(STI), ReportErrors(CopyReportErrors ? 
Other.ReportErrors : false), + ReversePairs() { + init(); +} bool HexagonMCChecker::check(bool FullCheck) { bool chkP = checkPredicates(); @@ -218,8 +227,9 @@ bool HexagonMCChecker::check(bool FullCheck) { bool chkAXOK = checkAXOK(); bool chkCofMax1 = checkCOFMax1(); bool chkHWLoop = checkHWLoop(); + bool chkLegalVecRegPair = checkLegalVecRegPair(); bool chk = chkP && chkNV && chkR && chkRRO && chkS && chkSh && chkSl && - chkAXOK && chkCofMax1 && chkHWLoop; + chkAXOK && chkCofMax1 && chkHWLoop && chkLegalVecRegPair; return chk; } @@ -729,3 +739,16 @@ void HexagonMCChecker::reportWarning(Twine const &Msg) { if (ReportErrors) Context.reportWarning(MCB.getLoc(), Msg); } + +bool HexagonMCChecker::checkLegalVecRegPair() { + const bool IsPermitted = STI.getFeatureBits()[Hexagon::ArchV67]; + const bool HasReversePairs = ReversePairs.size() != 0; + + if (!IsPermitted && HasReversePairs) { + for (auto R : ReversePairs) + reportError("register pair `" + Twine(RI.getName(R)) + + "' is not permitted for this architecture"); + return false; + } + return true; +} diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h index bc55ade9ccd78..00afdb664ba51 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h @@ -72,6 +72,10 @@ class HexagonMCChecker { using ReadOnlyIterator = std::set::iterator; std::set ReadOnly; + // Contains the vector-pair-registers with the even number + // first ("v0:1", e.g.) used/def'd in this packet. 
+ std::set ReversePairs; + void init(); void init(MCInst const &); void initReg(MCInst const &, unsigned, unsigned &PredReg, bool &isTrue); @@ -94,6 +98,7 @@ class HexagonMCChecker { bool checkAXOK(); bool checkHWLoop(); bool checkCOFMax1(); + bool checkLegalVecRegPair(); static void compoundRegisterMap(unsigned &); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index 95e23c99868a4..36800b4279437 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -391,15 +391,9 @@ void HexagonMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, static bool RegisterMatches(unsigned Consumer, unsigned Producer, unsigned Producer2) { - if (Consumer == Producer) - return true; - if (Consumer == Producer2) - return true; - // Calculate if we're a single vector consumer referencing a double producer - if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15) - if (Consumer >= Hexagon::V0 && Consumer <= Hexagon::V31) - return ((Consumer - Hexagon::V0) >> 1) == (Producer - Hexagon::W0); - return false; + return (Consumer == Producer) || (Consumer == Producer2) || + HexagonMCInstrInfo::IsSingleConsumerRefPairProducer(Producer, + Consumer); } /// EncodeSingleInstruction - Emit a single @@ -735,7 +729,8 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO, unsigned SOffset = 0; unsigned VOffset = 0; unsigned UseReg = MO.getReg(); - unsigned DefReg1, DefReg2; + unsigned DefReg1 = Hexagon::NoRegister; + unsigned DefReg2 = Hexagon::NoRegister; auto Instrs = HexagonMCInstrInfo::bundleInstructions(*State.Bundle); const MCOperand *I = Instrs.begin() + State.Index - 1; @@ -746,7 +741,8 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO, if (HexagonMCInstrInfo::isImmext(Inst)) continue; - DefReg1 = DefReg2 = 0; + DefReg1 = Hexagon::NoRegister; + 
DefReg2 = Hexagon::NoRegister; ++SOffset; if (HexagonMCInstrInfo::isVector(MCII, Inst)) { // Vector instructions don't count scalars. diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp index 4f8a432562196..f9f342a07f6dd 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp @@ -676,6 +676,45 @@ bool HexagonMCInstrInfo::isOuterLoop(MCInst const &MCI) { return (Flags & outerLoopMask) != 0; } +bool HexagonMCInstrInfo::IsVecRegPair(unsigned VecReg) { + return (VecReg >= Hexagon::W0 && VecReg <= Hexagon::W15) || + (VecReg >= Hexagon::WR0 && VecReg <= Hexagon::WR15); +} + +bool HexagonMCInstrInfo::IsReverseVecRegPair(unsigned VecReg) { + return (VecReg >= Hexagon::WR0 && VecReg <= Hexagon::WR15); +} + +bool HexagonMCInstrInfo::IsVecRegSingle(unsigned VecReg) { + return (VecReg >= Hexagon::V0 && VecReg <= Hexagon::V31); +} + +std::pair +HexagonMCInstrInfo::GetVecRegPairIndices(unsigned VecRegPair) { + assert(IsVecRegPair(VecRegPair) && + "VecRegPair must be a vector register pair"); + + const bool IsRev = IsReverseVecRegPair(VecRegPair); + const unsigned PairIndex = + 2 * (IsRev ? VecRegPair - Hexagon::WR0 : VecRegPair - Hexagon::W0); + + return IsRev ? std::make_pair(PairIndex, PairIndex + 1) + : std::make_pair(PairIndex + 1, PairIndex); +} + +bool HexagonMCInstrInfo::IsSingleConsumerRefPairProducer(unsigned Producer, + unsigned Consumer) { + if (IsVecRegPair(Producer) && IsVecRegSingle(Consumer)) { + const unsigned ProdPairIndex = IsReverseVecRegPair(Producer) + ? 
Producer - Hexagon::WR0 + : Producer - Hexagon::W0; + const unsigned ConsumerSingleIndex = (Consumer - Hexagon::V0) >> 1; + + return ConsumerSingleIndex == ProdPairIndex; + } + return false; +} + bool HexagonMCInstrInfo::isPredicated(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; @@ -971,9 +1010,8 @@ unsigned HexagonMCInstrInfo::SubregisterBit(unsigned Consumer, unsigned Producer2) { // If we're a single vector consumer of a double producer, set subreg bit // based on if we're accessing the lower or upper register component - if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15) - if (Consumer >= Hexagon::V0 && Consumer <= Hexagon::V31) - return (Consumer - Hexagon::V0) & 0x1; + if (IsVecRegPair(Producer) && IsVecRegSingle(Consumer)) + return (Consumer - Hexagon::V0) & 0x1; if (Producer2 != Hexagon::NoRegister) return Consumer == Producer; return 0; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h index 70022aaad7122..7b3c079880f8d 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h @@ -351,6 +351,16 @@ bool subInstWouldBeExtended(MCInst const &potentialDuplex); unsigned SubregisterBit(unsigned Consumer, unsigned Producer, unsigned Producer2); +bool IsVecRegSingle(unsigned VecReg); +bool IsVecRegPair(unsigned VecReg); +bool IsReverseVecRegPair(unsigned VecReg); +bool IsSingleConsumerRefPairProducer(unsigned Producer, unsigned Consumer); + +/// Returns an ordered pair of the constituent register ordinals for +/// each of the elements of \a VecRegPair. For example, Hexagon::W0 ("v0:1") +/// returns { 0, 1 } and Hexagon::W1 ("v3:2") returns { 3, 2 }. 
+std::pair GetVecRegPairIndices(unsigned VecRegPair); + // Attempt to find and replace compound pairs void tryCompound(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCContext &Context, MCInst &MCI); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index ac5ba87c798da..cd721999a110a 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -532,6 +532,10 @@ unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) { return F->second; } +llvm::ArrayRef Hexagon_MC::GetVectRegRev() { + return makeArrayRef(VectRegRev); +} + namespace { class HexagonMCInstrAnalysis : public MCInstrAnalysis { public: diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h index 6cc6f51ab12c4..a089abc3bd0c4 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCTARGETDESC_H #define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCTARGETDESC_H +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include #include @@ -82,6 +83,8 @@ namespace Hexagon_MC { void addArchSubtarget(MCSubtargetInfo const *STI, StringRef FS); unsigned GetELFFlags(const MCSubtargetInfo &STI); + + llvm::ArrayRef GetVectRegRev(); } MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII, diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp index 225cfa0cc4ef7..754cc94062692 100644 --- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp +++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp @@ -653,10 +653,10 @@ void MipsRegisterBankInfo::setRegBank(MachineInstr &MI, static void combineAwayG_UNMERGE_VALUES(LegalizationArtifactCombiner 
&ArtCombiner, - MachineInstr &MI) { + MachineInstr &MI, GISelObserverWrapper &Observer) { SmallVector UpdatedDefs; SmallVector DeadInstrs; - ArtCombiner.tryCombineMerges(MI, DeadInstrs, UpdatedDefs); + ArtCombiner.tryCombineMerges(MI, DeadInstrs, UpdatedDefs, Observer); for (MachineInstr *DeadMI : DeadInstrs) DeadMI->eraseFromParent(); } @@ -689,7 +689,7 @@ void MipsRegisterBankInfo::applyMappingImpl( // not be considered for regbank selection. RegBankSelect for mips // visits/makes corresponding G_MERGE first. Combine them here. if (NewMI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) - combineAwayG_UNMERGE_VALUES(ArtCombiner, *NewMI); + combineAwayG_UNMERGE_VALUES(ArtCombiner, *NewMI, WrapperObserver); // This G_MERGE will be combined away when its corresponding G_UNMERGE // gets regBankSelected. else if (NewMI->getOpcode() == TargetOpcode::G_MERGE_VALUES) @@ -701,7 +701,7 @@ void MipsRegisterBankInfo::applyMappingImpl( return; } case TargetOpcode::G_UNMERGE_VALUES: - combineAwayG_UNMERGE_VALUES(ArtCombiner, MI); + combineAwayG_UNMERGE_VALUES(ArtCombiner, MI, WrapperObserver); return; default: break; diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index a135f100ba04b..027e2c2d45afb 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -2632,12 +2632,15 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // used to provide rounding control: use MXCSR.RC, encoded as 0b100. // It's consistent with the other FP instructions, which are usually // controlled by MXCSR. - InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 4); + unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPS2PHZ128rr + : X86::VCVTPS2PHrr; + InputReg = fastEmitInst_ri(Opc, RC, InputReg, false, 4); // Move the lower 32-bits of ResultReg to another register of class GR32. + Opc = Subtarget->hasAVX512() ? 
X86::VMOVPDI2DIZrr + : X86::VMOVPDI2DIrr; ResultReg = createResultReg(&X86::GR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::VMOVPDI2DIrr), ResultReg) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(InputReg, RegState::Kill); // The result value is in the lower 16-bits of ResultReg. @@ -2645,19 +2648,21 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx); } else { assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!"); - // Explicitly sign-extend the input to 32-bit. - InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg, + // Explicitly zero-extend the input to 32-bit. + InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg, /*Kill=*/false); // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr. InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR, InputReg, /*Kill=*/true); - InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true); + unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPH2PSZ128rr + : X86::VCVTPH2PSrr; + InputReg = fastEmitInst_r(Opc, RC, InputReg, /*Kill=*/true); // The result value is in the lower 32-bits of ResultReg. // Emit an explicit copy from register class VR128 to register class FR32. 
- ResultReg = createResultReg(&X86::FR32RegClass); + ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(InputReg, RegState::Kill); diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp index 86d06f0fc7296..15745c10b780b 100644 --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -5651,7 +5651,7 @@ struct X86MemUnfoldTable { addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST); for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable3) - // Index 2, folded broadcast + // Index 3, folded broadcast addTableEntry(Entry, TB_INDEX_3 | TB_FOLDED_LOAD | TB_FOLDED_BCAST); // Sort the memory->reg unfold table. diff --git a/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/llvm/lib/Transforms/Scalar/LoopRotation.cpp index ed38169bb06f8..f92566ba77ce4 100644 --- a/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -81,10 +81,8 @@ class LoopRotateLegacyPass : public LoopPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); - if (EnableMSSALoopDependency) { - AU.addRequired(); + if (EnableMSSALoopDependency) AU.addPreserved(); - } getLoopAnalysisUsage(AU); } @@ -101,8 +99,11 @@ class LoopRotateLegacyPass : public LoopPass { const SimplifyQuery SQ = getBestSimplifyQuery(*this, F); Optional MSSAU; if (EnableMSSALoopDependency) { - MemorySSA *MSSA = &getAnalysis().getMSSA(); - MSSAU = MemorySSAUpdater(MSSA); + // Not requiring MemorySSA and getting it only if available will split + // the loop pass pipeline when LoopRotate is being run first. + auto *MSSAA = getAnalysisIfAvailable(); + if (MSSAA) + MSSAU = MemorySSAUpdater(&MSSAA->getMSSA()); } return LoopRotation(L, LI, TTI, AC, &DT, &SE, MSSAU.hasValue() ? 
MSSAU.getPointer() : nullptr, SQ, diff --git a/llvm/test/Analysis/MemoryDependenceAnalysis/InvariantLoad.ll b/llvm/test/Analysis/MemoryDependenceAnalysis/InvariantLoad.ll deleted file mode 100644 index 152cb175ef608..0000000000000 --- a/llvm/test/Analysis/MemoryDependenceAnalysis/InvariantLoad.ll +++ /dev/null @@ -1,173 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -gvn -S | FileCheck %s - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" -target triple = "x86_64-unknown-linux-gnu" - -declare void @llvm.memset.p0i8.i8(i8*, i8, i32, i1) -declare void @foo(i8*) - -define i8 @test(i1 %cmp) { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P:%.*]] = alloca i8 -; CHECK-NEXT: store i8 5, i8* [[P]] -; CHECK-NEXT: br label [[HEADER:%.*]] -; CHECK: header: -; CHECK-NEXT: [[V:%.*]] = phi i8 [ 5, [[ENTRY:%.*]] ], [ -5, [[ALIVE:%.*]] ] -; CHECK-NEXT: [[I:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[I_INC:%.*]], [[ALIVE]] ] -; CHECK-NEXT: br i1 [[CMP:%.*]], label [[ALIVE]], label [[DEAD:%.*]] -; CHECK: dead: -; CHECK-NEXT: call void @foo(i8* [[P]]) -; CHECK-NEXT: [[I_1:%.*]] = add i8 [[I]], [[V]] -; CHECK-NEXT: br label [[ALIVE]] -; CHECK: alive: -; CHECK-NEXT: [[I_2:%.*]] = phi i8 [ [[I]], [[HEADER]] ], [ [[I_1]], [[DEAD]] ] -; CHECK-NEXT: store i8 -5, i8* [[P]] -; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 1 [[P]], i8 0, i32 1, i1 false) -; CHECK-NEXT: [[I_INC]] = add i8 [[I_2]], 1 -; CHECK-NEXT: [[CMP_LOOP:%.*]] = icmp ugt i8 [[I_INC]], 100 -; CHECK-NEXT: br i1 [[CMP_LOOP]], label [[EXIT:%.*]], label [[HEADER]] -; CHECK: exit: -; CHECK-NEXT: ret i8 0 -; - -entry: - %p = alloca i8 - %addr = getelementptr inbounds i8, i8* %p, i64 0 - store i8 5, i8* %addr - br label %header -header: - %i = phi i8 [0, %entry], [%i.inc, %backedge] - br i1 %cmp, label %alive, label %dead -dead: - call void @foo(i8* %p) - %v = load i8, i8* %addr, !invariant.load !1 - %i.1 = add i8 %i, %v - br label %alive 
-alive: - %i.2 = phi i8 [%i, %header], [%i.1, %dead] - store i8 -5, i8* %addr - br label %backedge -backedge: - call void @llvm.memset.p0i8.i8(i8 * align 1 %p, i8 0, i32 1, i1 false) - %i.inc = add i8 %i.2, 1 - %cmp.loop = icmp ugt i8 %i.inc, 100 - br i1 %cmp.loop, label %exit, label %header -exit: - %res = load i8, i8* %addr - ret i8 %res -} - -; Check that first two loads are not optimized out while the one marked with -; invariant.load reuses %res1 -define i8 @test2(i1 %cmp, i8 *%p) { -; CHECK-LABEL: @test2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[RES1:%.*]] = load i8, i8* [[P:%.*]] -; CHECK-NEXT: call void @foo(i8* [[P]]) -; CHECK-NEXT: br i1 [[CMP:%.*]], label [[B2:%.*]], label [[B1:%.*]] -; CHECK: b1: -; CHECK-NEXT: [[RES2:%.*]] = load i8, i8* [[P]] -; CHECK-NEXT: [[RES3:%.*]] = add i8 [[RES1]], [[RES2]] -; CHECK-NEXT: br label [[ALIVE:%.*]] -; CHECK: b2: -; CHECK-NEXT: [[RES_DEAD:%.*]] = add i8 [[RES1]], [[RES1]] -; CHECK-NEXT: br label [[ALIVE]] -; CHECK: alive: -; CHECK-NEXT: [[RES_PHI:%.*]] = phi i8 [ [[RES3]], [[B1]] ], [ [[RES_DEAD]], [[B2]] ] -; CHECK-NEXT: ret i8 [[RES_PHI]] -; - -entry: - %res1 = load i8, i8* %p - call void @foo(i8 *%p) - br i1 %cmp, label %b2, label %b1 -b1: - %res2 = load i8, i8* %p - %res3 = add i8 %res1, %res2 - br label %alive -b2: - %v = load i8, i8* %p, !invariant.load !1 - %res.dead = add i8 %v, %res1 - br label %alive -alive: - %res.phi = phi i8 [%res3, %b1], [%res.dead, %b2] - ret i8 %res.phi -} - -; This is essentially the same test case as the above one but with %b1 and %b2 -; swapped in "br i1 %cmp, label %b1, label %b2" instruction. That helps us to -; ensure that results doesn't depend on visiting order. 
-define i8 @test3(i1 %cmp, i8 *%p) { -; CHECK-LABEL: @test3( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[RES1:%.*]] = load i8, i8* [[P:%.*]] -; CHECK-NEXT: call void @foo(i8* [[P]]) -; CHECK-NEXT: br i1 [[CMP:%.*]], label [[B1:%.*]], label [[B2:%.*]] -; CHECK: b1: -; CHECK-NEXT: [[RES2:%.*]] = load i8, i8* [[P]] -; CHECK-NEXT: [[RES3:%.*]] = add i8 [[RES1]], [[RES2]] -; CHECK-NEXT: br label [[ALIVE:%.*]] -; CHECK: b2: -; CHECK-NEXT: [[RES_DEAD:%.*]] = add i8 [[RES1]], [[RES1]] -; CHECK-NEXT: br label [[ALIVE]] -; CHECK: alive: -; CHECK-NEXT: [[RES_PHI:%.*]] = phi i8 [ [[RES3]], [[B1]] ], [ [[RES_DEAD]], [[B2]] ] -; CHECK-NEXT: ret i8 [[RES_PHI]] -; -entry: - %res1 = load i8, i8* %p - call void @foo(i8 *%p) - br i1 %cmp, label %b1, label %b2 -b1: - %res2 = load i8, i8* %p - %res3 = add i8 %res1, %res2 - br label %alive -b2: - %v = load i8, i8* %p, !invariant.load !1 - %res.dead = add i8 %v, %res1 - br label %alive -alive: - %res.phi = phi i8 [%res3, %b1], [%res.dead, %b2] - ret i8 %res.phi -} - - -; This is reduced test case catching regression in the first version of the -; fix for invariant loads (https://reviews.llvm.org/D64405). 
-define void @test4() { -; CHECK-LABEL: @test4( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* inttoptr (i64 8 to float*), align 4 -; CHECK-NEXT: [[TMP1:%.*]] = fmul float [[TMP0]], [[TMP0]] -; CHECK-NEXT: br label [[FUSION_LOOP_HEADER_DIM_1_PREHEADER:%.*]] -; CHECK: fusion.loop_header.dim.1.preheader: -; CHECK-NEXT: [[TMP2:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FUSION_LOOP_HEADER_DIM_1_PREHEADER]] ] -; CHECK-NEXT: [[FUSION_INVAR_ADDRESS_DIM_0_03:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INVAR_INC3:%.*]], [[FUSION_LOOP_HEADER_DIM_1_PREHEADER]] ] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 [[FUSION_INVAR_ADDRESS_DIM_0_03]], i64 0, i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = fmul float [[TMP2]], [[TMP2]] -; CHECK-NEXT: [[INVAR_INC3]] = add nuw nsw i64 [[FUSION_INVAR_ADDRESS_DIM_0_03]], 1 -; CHECK-NEXT: [[DOTPHI_TRANS_INSERT:%.*]] = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 [[INVAR_INC3]], i64 0, i64 2 -; CHECK-NEXT: [[DOTPRE]] = load float, float* [[DOTPHI_TRANS_INSERT]], align 4, !invariant.load !0 -; CHECK-NEXT: br label [[FUSION_LOOP_HEADER_DIM_1_PREHEADER]] -; -entry: - %0 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 0, i64 0, i64 2 - %1 = load float, float* %0, align 4 - %2 = fmul float %1, %1 - br label %fusion.loop_header.dim.1.preheader - -fusion.loop_header.dim.1.preheader: ; preds = %fusion.loop_header.dim.1.preheader, %entry - %fusion.invar_address.dim.0.03 = phi i64 [ 0, %entry ], [ %invar.inc3, %fusion.loop_header.dim.1.preheader ] - %3 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 %fusion.invar_address.dim.0.03, i64 0, i64 2 - %4 = load float, float* %3, align 4, !invariant.load !1 - %5 = fmul float %4, %4 - %6 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* null, i64 0, i64 
%fusion.invar_address.dim.0.03, i64 0, i64 2 - %7 = load float, float* %6, align 4, !invariant.load !1 - %8 = fmul float %7, %7 - %invar.inc3 = add nuw nsw i64 %fusion.invar_address.dim.0.03, 1 - br label %fusion.loop_header.dim.1.preheader -} - -!1 = !{} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/artifact-combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/artifact-combine-unmerge.mir new file mode 100644 index 0000000000000..3e42fd5b99e31 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/artifact-combine-unmerge.mir @@ -0,0 +1,73 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - -march=aarch64 -run-pass=legalizer %s | FileCheck %s + +# Make sure we don't lose the register bank constraints when +# artifact combining G_UNMERGE_VALUES instructions. +--- +name: test_none_none +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_none_none + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: $w0 = COPY [[COPY]](s32) + ; CHECK: $w1 = COPY [[COPY1]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w1 + %2:_(s64) = G_MERGE_VALUES %0(s32), %1 + %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %2(s64) + $w0 = COPY %3(s32) + $w1 = COPY %4(s32) +... +--- +name: test_gpr_none +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_gpr_none + ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr(s32) = COPY $w1 + ; CHECK: $w0 = COPY [[COPY]](s32) + ; CHECK: $w1 = COPY [[COPY1]](s32) + %0:gpr(s32) = COPY $w0 + %1:gpr(s32) = COPY $w1 + %2:_(s64) = G_MERGE_VALUES %0(s32), %1 + %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %2(s64) + $w0 = COPY %3(s32) + $w1 = COPY %4(s32) +... 
+--- +name: test_none_gpr +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_none_gpr + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:gpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:gpr(s32) = COPY [[COPY1]](s32) + ; CHECK: $w0 = COPY [[COPY2]](s32) + ; CHECK: $w1 = COPY [[COPY3]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w1 + %2:_(s64) = G_MERGE_VALUES %0(s32), %1 + %3:gpr(s32), %4:gpr(s32) = G_UNMERGE_VALUES %2(s64) + $w0 = COPY %3(s32) + $w1 = COPY %4(s32) +... +--- +name: test_fpr_gpr +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_fpr_gpr + ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:fpr(s32) = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:gpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:gpr(s32) = COPY [[COPY1]](s32) + ; CHECK: $w0 = COPY [[COPY2]](s32) + ; CHECK: $w1 = COPY [[COPY3]](s32) + %0:fpr(s32) = COPY $w0 + %1:fpr(s32) = COPY $w1 + %2:_(s64) = G_MERGE_VALUES %0(s32), %1 + %3:gpr(s32), %4:gpr(s32) = G_UNMERGE_VALUES %2(s64) + $w0 = COPY %3(s32) + $w1 = COPY %4(s32) +... 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll new file mode 100644 index 0000000000000..b5a05974f72be --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; BIF Bitwise Insert if False +; +; 8-bit vectors tests + +define <1 x i8> @test_bitf_v1i8(<1 x i8> %A, <1 x i8> %B, <1 x i8> %C) { +; CHECK-LABEL: test_bitf_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i8> %C, + %and = and <1 x i8> %neg, %B + %and1 = and <1 x i8> %C, %A + %or = or <1 x i8> %and, %and1 + ret <1 x i8> %or +} + +; 16-bit vectors tests + +define <1 x i16> @test_bitf_v1i16(<1 x i16> %A, <1 x i16> %B, <1 x i16> %C) { +; CHECK-LABEL: test_bitf_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i16> %C, + %and = and <1 x i16> %neg, %B + %and1 = and <1 x i16> %C, %A + %or = or <1 x i16> %and, %and1 + ret <1 x i16> %or +} + +; 32-bit vectors tests + +define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { +; CHECK-LABEL: test_bitf_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i32> %C, + %and = and <1 x i32> %neg, %B + %and1 = and <1 x i32> %C, %A + %or = or <1 x i32> %and, %and1 + ret <1 x i32> %or +} + +; 64-bit vectors tests + +define <1 x i64> @test_bitf_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C) { +; CHECK-LABEL: test_bitf_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i64> %C, + %and = and <1 x i64> %neg, %B + %and1 = and <1 x i64> %C, %A + %or = or <1 x i64> %and, %and1 + ret <1 x i64> %or +} + +define <2 x i32> @test_bitf_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +; CHECK-LABEL: test_bitf_v2i32: +; CHECK: // 
%bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <2 x i32> %C, + %and = and <2 x i32> %neg, %B + %and1 = and <2 x i32> %C, %A + %or = or <2 x i32> %and, %and1 + ret <2 x i32> %or +} + +define <4 x i16> @test_bitf_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +; CHECK-LABEL: test_bitf_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <4 x i16> %C, + %and = and <4 x i16> %neg, %B + %and1 = and <4 x i16> %C, %A + %or = or <4 x i16> %and, %and1 + ret <4 x i16> %or +} + +define <8 x i8> @test_bitf_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +; CHECK-LABEL: test_bitf_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <8 x i8> %C, + %and = and <8 x i8> %neg, %B + %and1 = and <8 x i8> %C, %A + %or = or <8 x i8> %and, %and1 + ret <8 x i8> %or +} + +; 128-bit vectors tests + +define <2 x i64> @test_bitf_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: test_bitf_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <2 x i64> %C, + %and = and <2 x i64> %neg, %B + %and1 = and <2 x i64> %C, %A + %or = or <2 x i64> %and, %and1 + ret <2 x i64> %or +} + +define <4 x i32> @test_bitf_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_bitf_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <4 x i32> %C, + %and = and <4 x i32> %neg, %B + %and1 = and <4 x i32> %C, %A + %or = or <4 x i32> %and, %and1 + ret <4 x i32> %or +} + +define <8 x i16> @test_bitf_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: test_bitf_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <8 x i16> %C, + %and = and <8 x i16> %neg, %B + %and1 = and <8 x i16> %C, %A + %or = or <8 x i16> %and, %and1 + ret <8 x i16> %or +} + +define <16 x i8> @test_bitf_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x 
i8> %C) { +; CHECK-LABEL: test_bitf_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <16 x i8> %C, + %and = and <16 x i8> %neg, %B + %and1 = and <16 x i8> %C, %A + %or = or <16 x i8> %and, %and1 + ret <16 x i8> %or +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll new file mode 100644 index 0000000000000..f29ea22ff8dbb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; BIT Bitwise Insert if True +; +; 8-bit vectors tests + +define <1 x i8> @test_bit_v1i8(<1 x i8> %A, <1 x i8> %B, <1 x i8> %C) { +; CHECK-LABEL: test_bit_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i8> %C, %B + %neg = xor <1 x i8> %C, + %and1 = and <1 x i8> %neg, %A + %or = or <1 x i8> %and, %and1 + ret <1 x i8> %or +} + +; 16-bit vectors tests + +define <1 x i16> @test_bit_v1i16(<1 x i16> %A, <1 x i16> %B, <1 x i16> %C) { +; CHECK-LABEL: test_bit_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i16> %C, %B + %neg = xor <1 x i16> %C, + %and1 = and <1 x i16> %neg, %A + %or = or <1 x i16> %and, %and1 + ret <1 x i16> %or +} + +; 32-bit vectors tests + +define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { +; CHECK-LABEL: test_bit_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i32> %C, %B + %neg = xor <1 x i32> %C, + %and1 = and <1 x i32> %neg, %A + %or = or <1 x i32> %and, %and1 + ret <1 x i32> %or +} + +; 64-bit vectors tests + +define <1 x i64> @test_bit_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C) { +; CHECK-LABEL: test_bit_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 
x i64> %C, %B + %neg = xor <1 x i64> %C, + %and1 = and <1 x i64> %neg, %A + %or = or <1 x i64> %and, %and1 + ret <1 x i64> %or +} + +define <2 x i32> @test_bit_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +; CHECK-LABEL: test_bit_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <2 x i32> %C, %B + %neg = xor <2 x i32> %C, + %and1 = and <2 x i32> %neg, %A + %or = or <2 x i32> %and, %and1 + ret <2 x i32> %or +} + +define <4 x i16> @test_bit_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +; CHECK-LABEL: test_bit_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <4 x i16> %C, %B + %neg = xor <4 x i16> %C, + %and1 = and <4 x i16> %neg, %A + %or = or <4 x i16> %and, %and1 + ret <4 x i16> %or +} + +define <8 x i8> @test_bit_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +; CHECK-LABEL: test_bit_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <8 x i8> %C, %B + %neg = xor <8 x i8> %C, + %and1 = and <8 x i8> %neg, %A + %or = or <8 x i8> %and, %and1 + ret <8 x i8> %or +} + +; 128-bit vectors tests + +define <2 x i64> @test_bit_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: test_bit_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <2 x i64> %C, %B + %neg = xor <2 x i64> %C, + %and1 = and <2 x i64> %neg, %A + %or = or <2 x i64> %and, %and1 + ret <2 x i64> %or +} + +define <4 x i32> @test_bit_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_bit_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <4 x i32> %C, %B + %neg = xor <4 x i32> %C, + %and1 = and <4 x i32> %neg, %A + %or = or <4 x i32> %and, %and1 + ret <4 x i32> %or +} + +define <8 x i16> @test_bit_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: test_bit_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, 
v2.16b +; CHECK-NEXT: ret + %and = and <8 x i16> %C, %B + %neg = xor <8 x i16> %C, + %and1 = and <8 x i16> %neg, %A + %or = or <8 x i16> %and, %and1 + ret <8 x i16> %or +} + +define <16 x i8> @test_bit_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: test_bit_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <16 x i8> %C, %B + %neg = xor <16 x i8> %C, + %and1 = and <16 x i8> %neg, %A + %or = or <16 x i8> %and, %and1 + ret <16 x i8> %or +} diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll index 464726b0d2f30..cad3fb58086d6 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll @@ -9,8 +9,7 @@ define <8x i8> @test_select_cc_v8i8_i8(i8 %a, i8 %b, <8x i8> %c, <8x i8> %d ) { ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.8b, v3.8b, v2.8b ; CHECK-NEXT: dup v2.8b, v2.b[0] -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i8 %a, %b %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d @@ -49,8 +48,7 @@ define <16x i8> @test_select_cc_v16i8_i8(i8 %a, i8 %b, <16x i8> %c, <16x i8> %d ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.16b, v3.16b, v2.16b ; CHECK-NEXT: dup v2.16b, v2.b[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i8 %a, %b %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d @@ -92,8 +90,7 @@ define <4x i16> @test_select_cc_v4i16(i16 %a, i16 %b, <4x i16> %c, <4x i16> %d ) ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.4h, v3.4h, v2.4h ; CHECK-NEXT: dup v2.4h, v2.h[0] -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i16 %a, %b %e = select i1 %cmp31, <4x i16> %c, <4x i16> %d @@ 
-107,8 +104,7 @@ define <8x i16> @test_select_cc_v8i16(i16 %a, i16 %b, <8x i16> %c, <8x i16> %d ) ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.8h, v3.8h, v2.8h ; CHECK-NEXT: dup v2.8h, v2.h[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i16 %a, %b %e = select i1 %cmp31, <8x i16> %c, <8x i16> %d @@ -122,8 +118,7 @@ define <2x i32> @test_select_cc_v2i32(i32 %a, i32 %b, <2x i32> %c, <2x i32> %d ) ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.2s, v3.2s, v2.2s ; CHECK-NEXT: dup v2.2s, v2.s[0] -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i32 %a, %b %e = select i1 %cmp31, <2x i32> %c, <2x i32> %d @@ -137,8 +132,7 @@ define <4x i32> @test_select_cc_v4i32(i32 %a, i32 %b, <4x i32> %c, <4x i32> %d ) ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.4s, v3.4s, v2.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i32 %a, %b %e = select i1 %cmp31, <4x i32> %c, <4x i32> %d @@ -151,8 +145,7 @@ define <1x i64> @test_select_cc_v1i64(i64 %a, i64 %b, <1x i64> %c, <1x i64> %d ) ; CHECK-NEXT: fmov d2, x1 ; CHECK-NEXT: fmov d3, x0 ; CHECK-NEXT: cmeq d2, d3, d2 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i64 %a, %b %e = select i1 %cmp31, <1x i64> %c, <1x i64> %d @@ -166,8 +159,7 @@ define <2x i64> @test_select_cc_v2i64(i64 %a, i64 %b, <2x i64> %c, <2x i64> %d ) ; CHECK-NEXT: fmov d3, x0 ; CHECK-NEXT: cmeq v2.2d, v3.2d, v2.2d ; CHECK-NEXT: dup v2.2d, v2.d[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i64 %a, %b %e = select i1 %cmp31, 
<2x i64> %c, <2x i64> %d @@ -222,8 +214,7 @@ define <4x float> @test_select_cc_v4f32_icmp(i32 %a, i32 %b, <4x float> %c, <4x ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.4s, v3.4s, v2.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i32 %a, %b %e = select i1 %cmp31, <4x float> %c, <4x float> %d @@ -247,8 +238,7 @@ define <1 x double> @test_select_cc_v1f64_icmp(i64 %a, i64 %b, <1 x double> %c, ; CHECK-NEXT: fmov d2, x1 ; CHECK-NEXT: fmov d3, x0 ; CHECK-NEXT: cmeq d2, d3, d2 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i64 %a, %b %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d @@ -278,8 +268,7 @@ define <2 x i32> @test_select_cc_v2i32_icmpi1(i1 %cc, <2 x i32> %a, <2 x i32> %b ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: dup v2.2s, w8 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp = icmp ne i1 %cc, 0 %e = select i1 %cmp, <2 x i32> %a, <2 x i32> %b @@ -294,8 +283,7 @@ define <3 x float> @test_select_cc_v3f32_fcmp_f32(<3 x float> %a, <3 x float> %b ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 ; CHECK-NEXT: fcmeq v2.4s, v2.4s, v3.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cc = fcmp oeq float %c1, %c2 %r = select i1 %cc, <3 x float> %a, <3 x float> %b @@ -309,8 +297,7 @@ define <3 x float> @test_select_cc_v3f32_fcmp_f64(<3 x float> %a, <3 x float> %b ; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 ; CHECK-NEXT: fcmeq v2.2d, v2.2d, v3.2d ; CHECK-NEXT: dup v2.2d, v2.d[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, 
v2.16b ; CHECK-NEXT: ret %cc = fcmp oeq double %c1, %c2 %r = select i1 %cc, <3 x float> %a, <3 x float> %b diff --git a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll index 59afe47042ffb..bf049c20e6c2d 100644 --- a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll @@ -5,8 +5,7 @@ define <4 x half> @select_64(<4 x half> %a, <4 x half> %b, <4 x i16> %c) #0 { ; CHECK-LABEL: select_64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret entry: %0 = bitcast <4 x half> %a to <4 x i16> @@ -23,8 +22,7 @@ entry: define <8 x half> @select_128(<8 x half> %a, <8 x half> %b, <8 x i16> %c) #0 { ; CHECK-LABEL: select_128: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret entry: %0 = bitcast <8 x half> %a to <8 x i16> diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll index 4fe52e7cae249..521f7f6521bf0 100644 --- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -61,8 +61,7 @@ define <8 x i8> @bsl8xi8_const(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: bsl8xi8_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0 > %tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1 > @@ -74,8 +73,7 @@ define <16 x i8> @bsl16xi8_const(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: bsl16xi8_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff -; CHECK-NEXT: bsl v2.16b, v0.16b, 
v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0 > %tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1 > @@ -664,8 +662,7 @@ define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: bsl2xi32_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0x000000ffffffff -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <2 x i32> %a, < i32 -1, i32 0 > %tmp2 = and <2 x i32> %b, < i32 0, i32 -1 > @@ -678,8 +675,7 @@ define <4 x i16> @bsl4xi16_const(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: bsl4xi16_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <4 x i16> %a, < i16 -1, i16 0, i16 -1,i16 0 > %tmp2 = and <4 x i16> %b, < i16 0, i16 -1,i16 0, i16 -1 > @@ -691,8 +687,7 @@ define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: bsl1xi64_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0xffffffffffffff00 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <1 x i64> %a, < i64 -256 > %tmp2 = and <1 x i64> %b, < i64 255 > @@ -704,8 +699,7 @@ define <4 x i32> @bsl4xi32_const(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: bsl4xi32_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <4 x i32> %a, < i32 -1, i32 0, i32 -1, i32 0 > %tmp2 = and <4 x i32> %b, < i32 0, i32 -1, i32 0, i32 -1 > @@ 
-717,8 +711,7 @@ define <8 x i16> @bsl8xi16_const(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: bsl8xi16_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 0,i16 0, i16 -1, i16 -1, i16 0,i16 0 > %tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 -1, i16 -1, i16 0, i16 0, i16 -1, i16 -1 > @@ -731,8 +724,7 @@ define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI75_0 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI75_0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <2 x i64> %a, < i64 -1, i64 0 > %tmp2 = and <2 x i64> %b, < i64 0, i64 -1 > diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll index 8e54d91662775..08adbd1507220 100644 --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -480,9 +480,9 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) { ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: mov w9, #42 ; CHECK-NEXT: cmhi v2.2d, v1.2d, v0.2d -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: dup v0.2d, x9 -; CHECK-NEXT: add v0.2d, v2.2d, v0.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: dup v1.2d, x9 +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %c = icmp ult <2 x i64> %x, %s = select <2 x i1> %c, <2 x i64> %x, <2 x i64> @@ -653,8 +653,8 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_min(<2 x i64> %x, <2 x i64> ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v2.16b, v1.16b ; CHECK-NEXT: cmhi v3.2d, v2.2d, v0.2d -; CHECK-NEXT: bsl v3.16b, v0.16b, v2.16b -; CHECK-NEXT: add v0.2d, v3.2d, v1.2d +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %noty = xor <2 x i64> 
%y, %c = icmp ult <2 x i64> %x, %noty diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll index a559b7868575f..cb1fac16aa9c1 100644 --- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll @@ -71,10 +71,9 @@ define <2 x float> @f2sqrt(<2 x float> %a) #0 { ; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s ; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s ; CHECK-NEXT: fmul v2.2s, v2.2s, v0.2s -; CHECK-NEXT: fmul v2.2s, v1.2s, v2.2s -; CHECK-NEXT: fcmeq v1.2s, v0.2s, #0.0 -; CHECK-NEXT: bsl v1.8b, v0.8b, v2.8b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.2s, v1.2s, v2.2s +; CHECK-NEXT: fcmeq v2.2s, v0.2s, #0.0 +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) ret <2 x float> %1 @@ -95,10 +94,9 @@ define <4 x float> @f4sqrt(<4 x float> %a) #0 { ; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s ; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s ; CHECK-NEXT: fmul v2.4s, v2.4s, v0.4s -; CHECK-NEXT: fmul v2.4s, v1.4s, v2.4s -; CHECK-NEXT: fcmeq v1.4s, v0.4s, #0.0 -; CHECK-NEXT: bsl v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) ret <4 x float> %1 @@ -120,21 +118,19 @@ define <8 x float> @f8sqrt(<8 x float> %a) #0 { ; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s ; CHECK-NEXT: frsqrts v3.4s, v0.4s, v3.4s ; CHECK-NEXT: fmul v3.4s, v3.4s, v0.4s -; CHECK-NEXT: fmul v3.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0 -; CHECK-NEXT: bsl v2.16b, v0.16b, v3.16b -; CHECK-NEXT: frsqrte v0.4s, v1.4s -; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fcmeq v3.4s, v0.4s, #0.0 +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: frsqrte v2.4s, v1.4s +; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s ; 
CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v3.4s -; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s ; CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s ; CHECK-NEXT: fmul v3.4s, v3.4s, v1.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v3.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s ; CHECK-NEXT: fcmeq v3.4s, v1.4s, #0.0 -; CHECK-NEXT: bsl v3.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: bif v1.16b, v2.16b, v3.16b ; CHECK-NEXT: ret %1 = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %a) ret <8 x float> %1 @@ -210,10 +206,9 @@ define <2 x double> @d2sqrt(<2 x double> %a) #0 { ; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d ; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d ; CHECK-NEXT: fmul v2.2d, v2.2d, v0.2d -; CHECK-NEXT: fmul v2.2d, v1.2d, v2.2d -; CHECK-NEXT: fcmeq v1.2d, v0.2d, #0.0 -; CHECK-NEXT: bsl v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d +; CHECK-NEXT: fcmeq v2.2d, v0.2d, #0.0 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) ret <2 x double> %1 @@ -238,24 +233,22 @@ define <4 x double> @d4sqrt(<4 x double> %a) #0 { ; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d ; CHECK-NEXT: fmul v3.2d, v3.2d, v0.2d -; CHECK-NEXT: fmul v3.2d, v2.2d, v3.2d -; CHECK-NEXT: fcmeq v2.2d, v0.2d, #0.0 -; CHECK-NEXT: bsl v2.16b, v0.16b, v3.16b -; CHECK-NEXT: frsqrte v0.2d, v1.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fcmeq v3.2d, v0.2d, #0.0 +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: frsqrte v2.2d, v1.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul 
v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d ; CHECK-NEXT: fmul v3.2d, v3.2d, v1.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d ; CHECK-NEXT: fcmeq v3.2d, v1.2d, #0.0 -; CHECK-NEXT: bsl v3.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: bif v1.16b, v2.16b, v3.16b ; CHECK-NEXT: ret %1 = tail call fast <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) ret <4 x double> %1 diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll index 0e2c891816c1d..2e385fdd6f25f 100644 --- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll +++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll @@ -62,8 +62,7 @@ define <4 x i32> @out_constant_varx_42(<4 x i32> %x, <4 x i32> %y, <4 x i32> %ma ; CHECK-LABEL: out_constant_varx_42: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %mask, %x @@ -76,8 +75,7 @@ define <4 x i32> @in_constant_varx_42(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mas ; CHECK-LABEL: in_constant_varx_42: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> %x, ; %x %n1 = and <4 x i32> %n0, %mask @@ -90,8 +88,7 @@ define <4 x i32> @out_constant_varx_42_invmask(<4 x i32> %x, <4 x i32> %y, <4 x ; CHECK-LABEL: out_constant_varx_42_invmask: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, 
#42 -; CHECK-NEXT: bsl v2.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %notmask, %x @@ -105,8 +102,7 @@ define <4 x i32> @in_constant_varx_42_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i ; CHECK-LABEL: in_constant_varx_42_invmask: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %n0 = xor <4 x i32> %x, ; %x @@ -169,9 +165,8 @@ define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x define <4 x i32> @out_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: out_constant_42_vary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %mask, @@ -183,9 +178,8 @@ define <4 x i32> @out_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %ma define <4 x i32> @in_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: in_constant_42_vary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> , %y ; %x %n1 = and <4 x i32> %n0, %mask @@ -197,9 +191,8 @@ define <4 x i32> @in_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mas define <4 x i32> @out_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: out_constant_42_vary_invmask: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b +; CHECK-NEXT: movi v0.4s, #42 +; 
CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %notmask, @@ -212,9 +205,8 @@ define <4 x i32> @out_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: in_constant_42_vary_invmask: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %n0 = xor <4 x i32> , %y ; %x diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll index df86540fdd964..607f5dd3dc772 100644 --- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll @@ -13,8 +13,7 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: out_v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i8> %x, %mask %notmask = xor <1 x i8> %mask, @@ -46,8 +45,7 @@ define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: out_v1i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i16> %x, %mask %notmask = xor <1 x i16> %mask, @@ -111,8 +109,7 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwin define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind { ; CHECK-LABEL: out_v1i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b 
-; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i32> %x, %mask %notmask = xor <1 x i32> %mask, @@ -128,8 +125,7 @@ define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwin define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-LABEL: out_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <8 x i8> %x, %mask %notmask = xor <8 x i8> %mask, @@ -141,8 +137,7 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: out_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <4 x i16> %x, %mask %notmask = xor <4 x i16> %mask, @@ -154,8 +149,7 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: out_v4i16_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <4 x i16> %x, %mask %notmask = xor <4 x i16> %mask, @@ -167,8 +161,7 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind { ; CHECK-LABEL: out_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <2 x i32> %x, %mask %notmask = xor <2 x i32> %mask, @@ -180,8 +173,7 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwin define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, 
<1 x i64> %mask) nounwind { ; CHECK-LABEL: out_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i64> %x, %mask %notmask = xor <1 x i64> %mask, @@ -197,8 +189,7 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwin define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind { ; CHECK-LABEL: out_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <16 x i8> %x, %mask %notmask = xor <16 x i8> %mask, @@ -210,8 +201,7 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-LABEL: out_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <8 x i16> %x, %mask %notmask = xor <8 x i16> %mask, @@ -223,8 +213,7 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: out_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -236,8 +225,7 @@ define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwin define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: out_v4i32_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -249,8 
+237,7 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) n define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind { ; CHECK-LABEL: out_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <2 x i64> %x, %mask %notmask = xor <2 x i64> %mask, @@ -270,8 +257,7 @@ define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwin define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: in_v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i8> %x, %y %n1 = and <1 x i8> %n0, %mask @@ -286,8 +272,7 @@ define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { ; CHECK-LABEL: in_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <2 x i8> %x, %y %n1 = and <2 x i8> %n0, %mask @@ -298,8 +283,7 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: in_v1i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i16> %x, %y %n1 = and <1 x i16> %n0, %mask @@ -314,8 +298,7 @@ define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-LABEL: in_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 
= xor <4 x i8> %x, %y %n1 = and <4 x i8> %n0, %mask @@ -326,8 +309,7 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind { ; CHECK-LABEL: in_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <2 x i16> %x, %y %n1 = and <2 x i16> %n0, %mask @@ -338,8 +320,7 @@ define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind { ; CHECK-LABEL: in_v1i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i32> %x, %y %n1 = and <1 x i32> %n0, %mask @@ -354,8 +335,7 @@ define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-LABEL: in_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <8 x i8> %x, %y %n1 = and <8 x i8> %n0, %mask @@ -366,8 +346,7 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: in_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <4 x i16> %x, %y %n1 = and <4 x i16> %n0, %mask @@ -378,8 +357,7 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind { ; CHECK-LABEL: in_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, 
v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <2 x i32> %x, %y %n1 = and <2 x i32> %n0, %mask @@ -390,8 +368,7 @@ define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind { ; CHECK-LABEL: in_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i64> %x, %y %n1 = and <1 x i64> %n0, %mask @@ -406,8 +383,7 @@ define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind { ; CHECK-LABEL: in_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <16 x i8> %x, %y %n1 = and <16 x i8> %n0, %mask @@ -418,8 +394,7 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-LABEL: in_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <8 x i16> %x, %y %n1 = and <8 x i16> %n0, %mask @@ -430,8 +405,7 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: in_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> %x, %y %n1 = and <4 x i32> %n0, %mask @@ -442,8 +416,7 @@ define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind { ; 
CHECK-LABEL: in_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <2 x i64> %x, %y %n1 = and <2 x i64> %n0, %mask diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll index de3c1fafb6de6..3f2ec1e17894f 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -318,8 +318,8 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -351,8 +351,8 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -577,8 +577,8 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -606,8 +606,8 @@ define <4 x i32> 
@test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI21_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -637,8 +637,8 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -670,8 +670,8 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -699,8 +699,8 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -729,8 +729,8 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-NEXT: 
ldr q4, [x8, :lo12:.LCPI25_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -761,8 +761,8 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI26_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll index 5ee7c2a9aee96..85abb4d7f8303 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll @@ -104,8 +104,8 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind { ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v5.4s ; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI4_4] ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll new file mode 100644 index 0000000000000..a374369478d12 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -0,0 +1,594 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -o - %s | FileCheck 
-check-prefix=GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s + +define amdgpu_ps i32 @s_bswap_i32(i32 inreg %src) { +; GFX7-LABEL: s_bswap_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_alignbit_b32 v0, s0, s0, 8 +; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 24 +; GFX7-NEXT: s_mov_b32 s0, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s0, v1, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_bswap_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_mov_b32 s0, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_bswap_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x10203 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog + %bswap = call i32 @llvm.bswap.i32(i32 %src) + %to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap) + ret i32 %to.sgpr +} + +define i32 @v_bswap_i32(i32 %src) { +; GFX7-LABEL: v_bswap_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %bswap = call i32 
@llvm.bswap.i32(i32 %src) + ret i32 %bswap +} + +define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) { +; GFX7-LABEL: s_bswap_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_alignbit_b32 v0, s0, s0, 8 +; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 24 +; GFX7-NEXT: s_mov_b32 s0, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s0, v1, v0 +; GFX7-NEXT: v_alignbit_b32 v1, s1, s1, 8 +; GFX7-NEXT: v_alignbit_b32 v2, s1, s1, 24 +; GFX7-NEXT: v_bfi_b32 v1, s0, v2, v1 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_bswap_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_mov_b32 s0, 0x10203 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_perm_b32 v1, 0, v1, s0 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_bswap_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x10203 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s0 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog + %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src) + %bswap.0 = extractelement <2 x i32> %bswap, i32 0 + %bswap.1 = extractelement <2 x i32> %bswap, i32 1 + %to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.0) + %to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.1) + %ins.0 = insertelement <2 x i32> undef, i32 %to.sgpr0, i32 0 + %ins.1 = insertelement <2 x i32> %ins.0, i32 %to.sgpr1, i32 1 + ret <2 x i32> %ins.1 +} + +define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) { +; GFX7-LABEL: v_bswap_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v2, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; 
GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX7-NEXT: v_alignbit_b32 v2, v1, v1, 8 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX8-NEXT: v_perm_b32 v1, 0, v1, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src) + ret <2 x i32> %bswap +} + +define amdgpu_ps <2 x i32> @s_bswap_i64(i64 inreg %src) { +; GFX7-LABEL: s_bswap_i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8 +; GFX7-NEXT: v_alignbit_b32 v1, s1, s1, 24 +; GFX7-NEXT: s_mov_b32 s1, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s1, v1, v0 +; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 8 +; GFX7-NEXT: v_alignbit_b32 v2, s0, s0, 24 +; GFX7-NEXT: v_bfi_b32 v1, s1, v2, v1 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_bswap_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_mov_b32 s1, 0x10203 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s1 +; GFX8-NEXT: v_perm_b32 v1, 0, v1, s1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_bswap_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_mov_b32 s1, 0x10203 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s1 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: 
v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog + %bswap = call i64 @llvm.bswap.i64(i64 %src) + %cast = bitcast i64 %bswap to <2 x i32> + %elt0 = extractelement <2 x i32> %cast, i32 0 + %elt1 = extractelement <2 x i32> %cast, i32 1 + %to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0) + %to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1) + %ins.0 = insertelement <2 x i32> undef, i32 %to.sgpr0, i32 0 + %ins.1 = insertelement <2 x i32> %ins.0, i32 %to.sgpr1, i32 1 + ret <2 x i32> %ins.1 +} + +define i64 @v_bswap_i64(i64 %src) { +; GFX7-LABEL: v_bswap_i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v2, v1, v1, 8 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v2, s4, v1, v2 +; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; GFX7-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v2, 0, v1, s4 +; GFX8-NEXT: v_perm_b32 v1, 0, v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v2, 0, v1, s4 +; GFX9-NEXT: v_perm_b32 v1, 0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %bswap = call i64 @llvm.bswap.i64(i64 %src) + ret i64 %bswap +} + +define amdgpu_ps <4 x i32> @s_bswap_v2i64(<2 x i64> inreg %src) { +; GFX7-LABEL: s_bswap_v2i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8 +; GFX7-NEXT: v_alignbit_b32 v1, s1, s1, 24 +; GFX7-NEXT: s_mov_b32 s1, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s1, v1, v0 +; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 8 +; 
GFX7-NEXT: v_alignbit_b32 v2, s0, s0, 24 +; GFX7-NEXT: v_bfi_b32 v1, s1, v2, v1 +; GFX7-NEXT: v_alignbit_b32 v2, s3, s3, 8 +; GFX7-NEXT: v_alignbit_b32 v3, s3, s3, 24 +; GFX7-NEXT: v_bfi_b32 v2, s1, v3, v2 +; GFX7-NEXT: v_alignbit_b32 v3, s2, s2, 8 +; GFX7-NEXT: v_alignbit_b32 v4, s2, s2, 24 +; GFX7-NEXT: v_bfi_b32 v3, s1, v4, v3 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NEXT: v_readfirstlane_b32 s3, v3 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_bswap_v2i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_mov_b32 s1, 0x10203 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s1 +; GFX8-NEXT: v_perm_b32 v2, 0, v2, s1 +; GFX8-NEXT: v_perm_b32 v3, 0, v3, s1 +; GFX8-NEXT: v_perm_b32 v1, 0, v1, s1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_bswap_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_mov_b32 s1, 0x10203 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s1 +; GFX9-NEXT: v_perm_b32 v2, 0, v2, s1 +; GFX9-NEXT: v_perm_b32 v3, 0, v3, s1 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: ; return to shader part epilog + %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src) + %cast = bitcast <2 x i64> %bswap to <4 x i32> + %bswap.0 = extractelement <4 x i32> %cast, i32 0 + %bswap.1 = extractelement <4 x i32> %cast, i32 1 + %bswap.2 = extractelement <4 x i32> %cast, i32 2 + %bswap.3 = 
extractelement <4 x i32> %cast, i32 3 + %to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.0) + %to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.1) + %to.sgpr2 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.2) + %to.sgpr3 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.3) + %ins.0 = insertelement <4 x i32> undef, i32 %to.sgpr0, i32 0 + %ins.1 = insertelement <4 x i32> %ins.0, i32 %to.sgpr1, i32 1 + %ins.2 = insertelement <4 x i32> %ins.1, i32 %to.sgpr2, i32 2 + %ins.3 = insertelement <4 x i32> %ins.2, i32 %to.sgpr3, i32 3 + ret <4 x i32> %ins.3 +} + +define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) { +; GFX7-LABEL: v_bswap_v2i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v4, v1, v1, 8 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v4, s4, v1, v4 +; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; GFX7-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v3, v3, 8 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v3, 24 +; GFX7-NEXT: v_bfi_b32 v5, s4, v3, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v2, 8 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v2, 24 +; GFX7-NEXT: v_bfi_b32 v3, s4, v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_v2i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v4, 0, v1, s4 +; GFX8-NEXT: v_perm_b32 v5, 0, v3, s4 +; GFX8-NEXT: v_perm_b32 v1, 0, v0, s4 +; GFX8-NEXT: v_perm_b32 v3, 0, v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v4, 0, v1, s4 +; GFX9-NEXT: v_perm_b32 v5, 0, v3, s4 +; 
GFX9-NEXT: v_perm_b32 v1, 0, v0, s4 +; GFX9-NEXT: v_perm_b32 v3, 0, v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src) + ret <2 x i64> %bswap +} + +define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) { +; GFX7-LABEL: s_bswap_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_alignbit_b32 v0, s0, s0, 8 +; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 24 +; GFX7-NEXT: s_mov_b32 s0, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s0, v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_bswap_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_mov_b32 s0, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_bswap_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x10203 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog + %bswap = call i16 @llvm.bswap.i16(i16 %src) + %zext = zext i16 %bswap to i32 + %to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) + %trunc = trunc i32 %to.sgpr to i16 + ret i16 %trunc +} + +define i16 @v_bswap_i16(i16 %src) { +; GFX7-LABEL: v_bswap_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %bswap = call i16 @llvm.bswap.i16(i16 %src) + ret i16 %bswap +} + +define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) { +; GFX7-LABEL: s_bswap_v2i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_alignbit_b32 v0, s0, s0, 8 +; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 24 +; GFX7-NEXT: s_mov_b32 s0, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s0, v1, v0 +; GFX7-NEXT: v_alignbit_b32 v1, s1, s1, 8 +; GFX7-NEXT: v_alignbit_b32 v2, s1, s1, 24 +; GFX7-NEXT: v_bfi_b32 v1, s0, v2, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_bswap_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_mov_b32 s0, 0x10203 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX8-NEXT: v_perm_b32 v1, 0, v1, s0 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: 
v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_bswap_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x10203 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s0 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog + %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src) + %cast0 = bitcast <2 x i16> %bswap to i32 + %to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %cast0) + ret i32 %to.sgpr +} + +define i32 @v_bswap_i16_zext_to_i32(i16 %src) { +; GFX7-LABEL: v_bswap_i16_zext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_i16_zext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_i16_zext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: 
s_setpc_b64 s[30:31] + %bswap = call i16 @llvm.bswap.i16(i16 %src) + %zext = zext i16 %bswap to i32 + ret i32 %zext +} + +define i32 @v_bswap_i16_sext_to_i32(i16 %src) { +; GFX7-LABEL: v_bswap_i16_sext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_i16_sext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_i16_sext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %bswap = call i16 @llvm.bswap.i16(i16 %src) + %zext = sext i16 %bswap to i32 + ret i32 %zext +} + +define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) { +; GFX7-LABEL: v_bswap_v2i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_alignbit_b32 v2, v0, v0, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; GFX7-NEXT: s_mov_b32 s4, 0xff00ff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX7-NEXT: v_alignbit_b32 v2, v1, v1, 8 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_bswap_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-NEXT: 
s_mov_b32 s4, 0x10203 +; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX8-NEXT: v_perm_b32 v1, 0, v1, s4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bswap_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x10203 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s4 +; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src) + ret <2 x i16> %bswap +} + +; FIXME +; define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) { +; %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %ext.src) +; ret <3 x i16> %bswap +; } + +declare i32 @llvm.amdgcn.readfirstlane(i32) #0 +declare i16 @llvm.bswap.i16(i16) #1 +declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) #1 +declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) #1 +declare i32 @llvm.bswap.i32(i32) #1 +declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) #1 +declare i64 @llvm.bswap.i64(i64) #1 +declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) #1 + +attributes #0 = { convergent nounwind readnone } +attributes #1 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir new file mode 100644 index 0000000000000..2200618ee04e2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir @@ -0,0 +1,28 @@ +# NOTE: Assertions have been autogenerated by 
utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s + +--- +name: bswap_i32_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0 + ; GFX7-LABEL: name: bswap_i32_vv + ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[V_ALIGNBIT_B32_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32 [[COPY]], [[COPY]], 8, implicit $exec + ; GFX7: [[V_ALIGNBIT_B32_1:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32 [[COPY]], [[COPY]], 24, implicit $exec + ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16711935 + ; GFX7: [[V_BFI_B32_:%[0-9]+]]:vgpr_32 = V_BFI_B32 [[S_MOV_B32_]], [[V_ALIGNBIT_B32_1]], [[V_ALIGNBIT_B32_]], implicit $exec + ; GFX7: S_ENDPGM 0, implicit [[V_BFI_B32_]] + ; GFX8-LABEL: name: bswap_i32_vv + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 + ; GFX8: [[V_PERM_B32_:%[0-9]+]]:vgpr_32 = V_PERM_B32 0, [[COPY]], [[S_MOV_B32_]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_PERM_B32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = G_BSWAP %0 + S_ENDPGM 0, implicit %1 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bswap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bswap.mir index 36539926c3658..a7c4773c20d1c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bswap.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bswap.mir @@ -129,3 +129,20 @@ body: | $vgpr0_vgpr1 = COPY %1 ... 
+--- +name: bswap_s64 + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: bswap_s64 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[BSWAP:%[0-9]+]]:_(s32) = G_BSWAP [[UV1]] + ; CHECK: [[BSWAP1:%[0-9]+]]:_(s32) = G_BSWAP [[UV]] + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[BSWAP]](s32), [[BSWAP1]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = G_BSWAP %0 + $vgpr0_vgpr1 = COPY %1 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir index eb660979a9ce5..fe05c7b2ff469 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir @@ -79,3 +79,417 @@ body: | %2:_(<2 x s32>) = G_ANYEXT %1 $vgpr0_vgpr1 = COPY %2 ... + +--- +name: test_fptrunc_s64_to_s16 +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: test_fptrunc_s64_to_s16 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1008 + ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2047 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C3]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4094 + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C4]] + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 511 + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C5]] + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[UV]] + ; CHECK: [[C6:%[0-9]+]]:_(s32) 
= G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR]](s32), [[C6]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[ZEXT]] + ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 512 + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR1]](s32), [[C6]] + ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C7]], [[C6]] + ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 31744 + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SELECT]], [[C8]] + ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C9]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL]] + ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[AND]] + ; CHECK: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[C6]] + ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 13 + ; CHECK: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SMAX]], [[C11]] + ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4096 + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[C12]] + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[OR4]], [[SMIN]](s32) + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[SMIN]](s32) + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL1]](s32), [[OR4]] + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP2]](s1) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[ZEXT1]] + ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[AND]](s32), [[C10]] + ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[OR5]], [[OR3]] + ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C13]] + ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[SELECT1]], [[C14]](s32) + ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND3]](s32), [[C15]] + ; CHECK: 
[[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP4]](s1) + ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND3]](s32), [[C16]] + ; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP5]](s1) + ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[ZEXT3]] + ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR3]], [[OR6]] + ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 30 + ; CHECK: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND]](s32), [[C17]] + ; CHECK: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s1), [[C8]], [[ADD1]] + ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 1039 + ; CHECK: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND]](s32), [[C18]] + ; CHECK: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP7]](s1), [[OR2]], [[SELECT2]] + ; CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C19]](s32) + ; CHECK: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 32768 + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C20]] + ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SELECT3]] + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[OR7]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s16) = G_FPTRUNC %0 + %2:_(s32) = G_ANYEXT %1 + $vgpr0 = COPY %2 +... 
+ +--- +name: test_fptrunc_v2s64_to_v2s16 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + + ; CHECK-LABEL: name: test_fptrunc_v2s64_to_v2s16 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) + ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1008 + ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2047 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C3]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4094 + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C4]] + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 511 + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV3]], [[C5]] + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[UV2]] + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR]](s32), [[C6]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[ZEXT]] + ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 512 + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR1]](s32), [[C6]] + ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C7]], [[C6]] + ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 31744 + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SELECT]], [[C8]] + ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C9]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL]] + ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB 
[[C10]], [[AND]] + ; CHECK: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[C6]] + ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 13 + ; CHECK: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SMAX]], [[C11]] + ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4096 + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[C12]] + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[OR4]], [[SMIN]](s32) + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[SMIN]](s32) + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL1]](s32), [[OR4]] + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP2]](s1) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[ZEXT1]] + ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[AND]](s32), [[C10]] + ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[OR5]], [[OR3]] + ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C13]] + ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[SELECT1]], [[C14]](s32) + ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND3]](s32), [[C15]] + ; CHECK: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP4]](s1) + ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND3]](s32), [[C16]] + ; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP5]](s1) + ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[ZEXT3]] + ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR3]], [[OR6]] + ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 30 + ; CHECK: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND]](s32), [[C17]] + ; CHECK: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s1), [[C8]], [[ADD1]] + ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 1039 + ; CHECK: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND]](s32), [[C18]] + ; CHECK: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP7]](s1), [[OR2]], [[SELECT2]] + ; 
CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C19]](s32) + ; CHECK: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 32768 + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C20]] + ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SELECT3]] + ; CHECK: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) + ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32) + ; CHECK: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[LSHR5]], [[C1]] + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C2]] + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C3]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C4]] + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C5]] + ; CHECK: [[OR8:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[UV4]] + ; CHECK: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR8]](s32), [[C6]] + ; CHECK: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP8]](s1) + ; CHECK: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[ZEXT4]] + ; CHECK: [[ICMP9:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR9]](s32), [[C6]] + ; CHECK: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP9]](s1), [[C7]], [[C6]] + ; CHECK: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SELECT4]], [[C8]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C9]](s32) + ; CHECK: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL2]] + ; CHECK: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[AND5]] + ; CHECK: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[C6]] + ; CHECK: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[C11]] + ; CHECK: [[OR12:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[C12]] + ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[OR12]], [[SMIN1]](s32) + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR7]], [[SMIN1]](s32) + ; CHECK: [[ICMP10:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL3]](s32), [[OR12]] + ; CHECK: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP10]](s1) + ; CHECK: [[OR13:%[0-9]+]]:_(s32) = G_OR [[LSHR7]], [[ZEXT5]] + ; CHECK: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP 
intpred(slt), [[AND5]](s32), [[C10]] + ; CHECK: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP11]](s1), [[OR13]], [[OR11]] + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[SELECT5]], [[C13]] + ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[SELECT5]], [[C14]](s32) + ; CHECK: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND8]](s32), [[C15]] + ; CHECK: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP12]](s1) + ; CHECK: [[ICMP13:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND8]](s32), [[C16]] + ; CHECK: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP13]](s1) + ; CHECK: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[ZEXT7]] + ; CHECK: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[LSHR8]], [[OR14]] + ; CHECK: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND5]](s32), [[C17]] + ; CHECK: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[ICMP14]](s1), [[C8]], [[ADD3]] + ; CHECK: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND5]](s32), [[C18]] + ; CHECK: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP15]](s1), [[OR10]], [[SELECT6]] + ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C19]](s32) + ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[LSHR9]], [[C20]] + ; CHECK: [[OR15:%[0-9]+]]:_(s32) = G_OR [[AND9]], [[SELECT7]] + ; CHECK: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[OR7]](s32) + ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C21]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[OR15]](s32) + ; CHECK: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C21]] + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C19]](s32) + ; CHECK: [[OR16:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL4]] + ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR16]](s32) + ; CHECK: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(<2 x s16>) = G_FPTRUNC %0 + $vgpr0 = COPY %1 +... 
+ +--- +name: test_fptrunc_s64_to_s16_afn +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: test_fptrunc_s64_to_s16_afn + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1008 + ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2047 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C3]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4094 + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C4]] + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 511 + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C5]] + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[UV]] + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR]](s32), [[C6]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[ZEXT]] + ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 512 + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR1]](s32), [[C6]] + ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C7]], [[C6]] + ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 31744 + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SELECT]], [[C8]] + ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C9]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL]] + ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[AND]] + ; CHECK: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[C6]] + ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 13 
+ ; CHECK: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SMAX]], [[C11]] + ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4096 + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[C12]] + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[OR4]], [[SMIN]](s32) + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[SMIN]](s32) + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL1]](s32), [[OR4]] + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP2]](s1) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[ZEXT1]] + ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[AND]](s32), [[C10]] + ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[OR5]], [[OR3]] + ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C13]] + ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[SELECT1]], [[C14]](s32) + ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND3]](s32), [[C15]] + ; CHECK: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP4]](s1) + ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND3]](s32), [[C16]] + ; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP5]](s1) + ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[ZEXT3]] + ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR3]], [[OR6]] + ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 30 + ; CHECK: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND]](s32), [[C17]] + ; CHECK: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s1), [[C8]], [[ADD1]] + ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 1039 + ; CHECK: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND]](s32), [[C18]] + ; CHECK: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP7]](s1), [[OR2]], [[SELECT2]] + ; CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C19]](s32) + ; CHECK: 
[[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 32768 + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C20]] + ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SELECT3]] + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[OR7]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s16) = G_FPTRUNC %0 + %2:_(s32) = afn G_ANYEXT %1 + $vgpr0 = COPY %2 +... + +--- +name: test_fptrunc_v2s64_to_v2s16_afn +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + + ; CHECK-LABEL: name: test_fptrunc_v2s64_to_v2s16_afn + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) + ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1008 + ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2047 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C3]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4094 + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C4]] + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 511 + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV3]], [[C5]] + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[UV2]] + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR]](s32), [[C6]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[ZEXT]] + ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 512 + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR1]](s32), [[C6]] + ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C7]], [[C6]] + 
; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 31744 + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SELECT]], [[C8]] + ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C9]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL]] + ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[AND]] + ; CHECK: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[C6]] + ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 13 + ; CHECK: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SMAX]], [[C11]] + ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4096 + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[C12]] + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[OR4]], [[SMIN]](s32) + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[SMIN]](s32) + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL1]](s32), [[OR4]] + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP2]](s1) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[ZEXT1]] + ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[AND]](s32), [[C10]] + ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[OR5]], [[OR3]] + ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C13]] + ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[SELECT1]], [[C14]](s32) + ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND3]](s32), [[C15]] + ; CHECK: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP4]](s1) + ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND3]](s32), [[C16]] + ; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP5]](s1) + ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[ZEXT3]] + ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR3]], [[OR6]] + ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 30 + ; 
CHECK: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND]](s32), [[C17]] + ; CHECK: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s1), [[C8]], [[ADD1]] + ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 1039 + ; CHECK: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND]](s32), [[C18]] + ; CHECK: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP7]](s1), [[OR2]], [[SELECT2]] + ; CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C19]](s32) + ; CHECK: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 32768 + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C20]] + ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SELECT3]] + ; CHECK: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) + ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32) + ; CHECK: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[LSHR5]], [[C1]] + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C2]] + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C3]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C4]] + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C5]] + ; CHECK: [[OR8:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[UV4]] + ; CHECK: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR8]](s32), [[C6]] + ; CHECK: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP8]](s1) + ; CHECK: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[ZEXT4]] + ; CHECK: [[ICMP9:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR9]](s32), [[C6]] + ; CHECK: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP9]](s1), [[C7]], [[C6]] + ; CHECK: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SELECT4]], [[C8]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C9]](s32) + ; CHECK: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL2]] + ; CHECK: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[AND5]] + ; CHECK: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[C6]] + ; CHECK: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[C11]] + ; CHECK: [[OR12:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[C12]] + ; CHECK: 
[[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[OR12]], [[SMIN1]](s32) + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR7]], [[SMIN1]](s32) + ; CHECK: [[ICMP10:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL3]](s32), [[OR12]] + ; CHECK: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP10]](s1) + ; CHECK: [[OR13:%[0-9]+]]:_(s32) = G_OR [[LSHR7]], [[ZEXT5]] + ; CHECK: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[AND5]](s32), [[C10]] + ; CHECK: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP11]](s1), [[OR13]], [[OR11]] + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[SELECT5]], [[C13]] + ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[SELECT5]], [[C14]](s32) + ; CHECK: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND8]](s32), [[C15]] + ; CHECK: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP12]](s1) + ; CHECK: [[ICMP13:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND8]](s32), [[C16]] + ; CHECK: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP13]](s1) + ; CHECK: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[ZEXT7]] + ; CHECK: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[LSHR8]], [[OR14]] + ; CHECK: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND5]](s32), [[C17]] + ; CHECK: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[ICMP14]](s1), [[C8]], [[ADD3]] + ; CHECK: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND5]](s32), [[C18]] + ; CHECK: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP15]](s1), [[OR10]], [[SELECT6]] + ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C19]](s32) + ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[LSHR9]], [[C20]] + ; CHECK: [[OR15:%[0-9]+]]:_(s32) = G_OR [[AND9]], [[SELECT7]] + ; CHECK: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[OR7]](s32) + ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C21]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[OR15]](s32) + ; CHECK: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C21]] + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C19]](s32) + ; CHECK: [[OR16:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL4]] + ; 
CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR16]](s32) + ; CHECK: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(<2 x s16>) = afn G_FPTRUNC %0 + $vgpr0 = COPY %1 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir index f78f17ebfe54f..30065daa3bde2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_sdiv_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir index 5402048fae983..aaf6431f2d933 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck 
-check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_srem_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir index f6b43a81e4e1a..3a2e294741182 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_udiv_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir index 29e2e12bdd6be..efaa4f39b1908 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_urem_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bswap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bswap.mir index 9850b87959af3..818c9368ea9e4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bswap.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bswap.mir @@ -11,7 +11,8 @@ body: | liveins: $sgpr0 ; CHECK-LABEL: name: bswap_i32_s ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK: [[BSWAP:%[0-9]+]]:sgpr(s32) = G_BSWAP [[COPY]] + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[BSWAP:%[0-9]+]]:vgpr(s32) = G_BSWAP [[COPY1]] %0:_(s32) = COPY $sgpr0 %1:_(s32) = G_BSWAP %0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index daf56e41522a2..0f4c09433c1cc 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -5505,8 +5505,13 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 % define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { ; CHECK-LABEL: @udiv_v2i64_pow2k_denom( -; CHECK-NEXT: [[R:%.*]] = udiv <2 x i64> [[X:%.*]], -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v2i64_pow2k_denom: @@ -5516,8 +5521,8 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 ; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -5531,8 +5536,13 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { ; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom( -; CHECK-NEXT: [[R:%.*]] = udiv <2 x i64> [[X:%.*]], -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* 
[[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v2i64_mixed_pow2k_denom: @@ -5540,7 +5550,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s4, 0xf001 +; GCN-NEXT: s_movk_i32 s6, 0xf001 ; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5549,11 +5559,13 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GCN-NEXT: s_movk_i32 s0, 0xfff +; GCN-NEXT: v_mul_hi_u32 v3, v0, s6 +; GCN-NEXT: v_mul_lo_u32 v5, v1, s6 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s6 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s4 -; GCN-NEXT: v_mul_lo_u32 v5, v1, s4 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 ; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 @@ -5571,19 +5583,17 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc -; GCN-NEXT: v_mul_hi_u32 
v5, v0, s4 +; GCN-NEXT: v_mul_hi_u32 v5, v0, s6 ; GCN-NEXT: v_addc_u32_e64 v3, vcc, v1, v4, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v6, v3, s4 -; GCN-NEXT: v_mul_lo_u32 v8, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v6, v3, s6 +; GCN-NEXT: v_mul_lo_u32 v8, v0, s6 ; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v5 ; GCN-NEXT: v_mul_hi_u32 v9, v0, v8 ; GCN-NEXT: v_mul_hi_u32 v10, v0, v5 ; GCN-NEXT: v_mul_hi_u32 v11, v3, v5 -; GCN-NEXT: s_movk_i32 s0, 0xfff ; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v7, v10, vcc ; GCN-NEXT: v_mul_lo_u32 v10, v3, v8 @@ -5608,6 +5618,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc ; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v2, vcc @@ -5641,9 +5652,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GCN-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] -; GCN-NEXT: s_lshr_b64 s[0:1], s[8:9], 12 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %r = udiv <2 x i64> %x, @@ -5654,8 +5664,15 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x 
i64> , [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = udiv <2 x i64> [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v2i64_pow2_shl_denom: @@ -5666,10 +5683,10 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_i32 s1, s2, 12 ; GCN-NEXT: s_add_i32 s0, s0, 12 -; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s1 +; GCN-NEXT: s_add_i32 s2, s2, 12 ; GCN-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -5874,8 +5891,13 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 % define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { ; CHECK-LABEL: @urem_v2i64_pow2k_denom( -; CHECK-NEXT: [[R:%.*]] = urem <2 x i64> [[X:%.*]], -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = urem i64 [[TMP1]], 4096 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = 
extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_v2i64_pow2k_denom: @@ -5887,8 +5909,8 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s1, s2, s8 ; GCN-NEXT: s_and_b32 s0, s0, s8 +; GCN-NEXT: s_and_b32 s1, s2, s8 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 @@ -5902,8 +5924,15 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: @urem_v2i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> , [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = urem <2 x i64> [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_v2i64_pow2_shl_denom: @@ -5916,14 +5945,14 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: s_mov_b32 
s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b64 s[0:1], s[12:13], s0 ; GCN-NEXT: s_lshl_b64 s[2:3], s[12:13], s2 -; GCN-NEXT: s_add_u32 s2, s2, -1 -; GCN-NEXT: s_addc_u32 s3, s3, -1 -; GCN-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3] +; GCN-NEXT: s_lshl_b64 s[0:1], s[12:13], s0 ; GCN-NEXT: s_add_u32 s0, s0, -1 ; GCN-NEXT: s_addc_u32 s1, s1, -1 ; GCN-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] +; GCN-NEXT: s_add_u32 s2, s2, -1 +; GCN-NEXT: s_addc_u32 s3, s3, -1 +; GCN-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -6249,8 +6278,13 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 % define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { ; CHECK-LABEL: @sdiv_v2i64_pow2k_denom( -; CHECK-NEXT: [[R:%.*]] = sdiv <2 x i64> [[X:%.*]], -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_v2i64_pow2k_denom: @@ -6260,16 +6294,16 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s8, s3, 31 -; GCN-NEXT: s_lshr_b32 s8, s8, 20 -; GCN-NEXT: s_add_u32 s2, s2, s8 -; GCN-NEXT: s_addc_u32 s3, s3, 0 ; GCN-NEXT: s_ashr_i32 s8, s1, 31 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 ; GCN-NEXT: 
s_lshr_b32 s8, s8, 20 ; GCN-NEXT: s_add_u32 s0, s0, s8 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_ashr_i32 s8, s3, 31 ; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GCN-NEXT: s_lshr_b32 s8, s8, 20 +; GCN-NEXT: s_add_u32 s2, s2, s8 +; GCN-NEXT: s_addc_u32 s3, s3, 0 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -6283,101 +6317,112 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { ; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom( -; CHECK-NEXT: [[R:%.*]] = sdiv <2 x i64> [[X:%.*]], -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 -; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 +; GCN-NEXT: v_mov_b32_e32 v0, 0x457ff000 +; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GCN-NEXT: v_mac_f32_e32 v0, 0, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_movk_i32 s6, 0xf001 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mac_f32_e32 v0, 
0xcf800000, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 -; GCN-NEXT: v_mul_lo_u32 v2, v1, s6 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s6 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i32 s0, s9, 31 +; GCN-NEXT: s_lshr_b32 s0, s0, 20 +; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-NEXT: v_mul_lo_u32 v3, v1, s6 +; GCN-NEXT: s_add_u32 s2, s8, s0 +; GCN-NEXT: s_addc_u32 s3, s9, 0 +; GCN-NEXT: s_ashr_i32 s8, s11, 31 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v0, s6 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v8, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v4, vcc -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v4, v2, s6 -; GCN-NEXT: v_mul_hi_u32 v6, s6, v0 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 +; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GCN-NEXT: v_mul_lo_u32 v6, v0, 
s6 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v8, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc +; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] +; GCN-NEXT: v_mul_lo_u32 v5, v2, s6 +; GCN-NEXT: v_mul_hi_u32 v7, s6, v0 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GCN-NEXT: v_mul_lo_u32 v7, v0, s6 +; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 +; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 +; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 +; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 +; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 +; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 +; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v7, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[2:3] -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s2, s11, 31 -; GCN-NEXT: s_add_u32 s0, s10, s2 +; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] +; 
GCN-NEXT: s_add_u32 s0, s10, s8 +; GCN-NEXT: s_addc_u32 s1, s11, s8 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_mov_b32 s3, s2 -; GCN-NEXT: s_addc_u32 s1, s11, s2 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v6, s1, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v7, s1, v1 ; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 -; GCN-NEXT: s_movk_i32 s3, 0xfff +; GCN-NEXT: s_movk_i32 s9, 0xfff ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, s3 -; GCN-NEXT: v_mul_hi_u32 v3, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s3 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, s9 +; GCN-NEXT: v_mul_hi_u32 v3, s9, v0 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s9 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s3, v4 +; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v4 ; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc ; GCN-NEXT: s_movk_i32 s0, 0xffe ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 @@ -6394,22 +6439,17 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; 
GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc ; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: s_ashr_i32 s0, s9, 31 -; GCN-NEXT: s_lshr_b32 s0, s0, 20 -; GCN-NEXT: s_add_u32 s0, s8, s0 -; GCN-NEXT: s_addc_u32 s1, s9, 0 -; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0 -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 +; GCN-NEXT: v_xor_b32_e32 v1, s8, v1 +; GCN-NEXT: v_mov_b32_e32 v3, s8 ; GCN-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %r = sdiv <2 x i64> %x, @@ -6420,8 +6460,15 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> , [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = sdiv <2 x i64> [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 
1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_v2i64_pow2_shl_denom: @@ -6432,8 +6479,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: s_mov_b32 s18, 0x4f800000 ; GCN-NEXT: s_mov_b32 s19, 0x5f7ffffc ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b64 s[12:13], s[2:3], s4 -; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 +; GCN-NEXT: s_lshl_b64 s[12:13], s[2:3], s6 +; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GCN-NEXT: s_ashr_i32 s16, s3, 31 ; GCN-NEXT: s_add_u32 s2, s2, s16 ; GCN-NEXT: s_mov_b32 s17, s16 @@ -6503,22 +6550,22 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s2, s11, 31 -; GCN-NEXT: s_add_u32 s0, s10, s2 +; GCN-NEXT: s_ashr_i32 s2, s9, 31 +; GCN-NEXT: s_add_u32 s0, s8, s2 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: s_mov_b32 s3, s2 -; GCN-NEXT: s_addc_u32 s1, s11, s2 -; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[2:3] +; GCN-NEXT: s_addc_u32 s1, s9, s2 +; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v7, s11, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v2, s8, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 +; GCN-NEXT: v_mul_hi_u32 v5, s8, v1 +; GCN-NEXT: v_mul_hi_u32 v7, s9, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s9, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, 
v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s9, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s9, v0 ; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 @@ -6533,8 +6580,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s14, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, s11, v2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, s9, v2 +; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 ; GCN-NEXT: v_subb_u32_e64 v5, s[0:1], v5, v7, vcc ; GCN-NEXT: v_subrev_i32_e64 v7, s[0:1], s14, v3 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] @@ -6548,14 +6595,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] ; GCN-NEXT: v_add_i32_e64 v9, s[0:1], 1, v0 ; GCN-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1] -; GCN-NEXT: s_ashr_i32 s10, s13, 31 +; GCN-NEXT: s_ashr_i32 s8, s13, 31 ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GCN-NEXT: s_add_u32 s12, s12, s10 +; GCN-NEXT: s_add_u32 s12, s12, s8 ; GCN-NEXT: v_cndmask_b32_e64 v5, v10, v8, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v8, s11 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: s_addc_u32 s13, s13, s10 -; GCN-NEXT: s_xor_b64 s[12:13], s[12:13], s[10:11] +; GCN-NEXT: v_mov_b32_e32 v8, s9 +; GCN-NEXT: s_mov_b32 s9, s8 +; GCN-NEXT: s_addc_u32 s13, s13, s8 +; GCN-NEXT: s_xor_b64 s[12:13], s[12:13], s[8:9] ; GCN-NEXT: v_cvt_f32_u32_e32 v10, s12 ; GCN-NEXT: v_cvt_f32_u32_e32 v11, s13 ; GCN-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc @@ -6624,42 +6671,42 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GCN-NEXT: s_ashr_i32 s14, s9, 31 +; 
GCN-NEXT: s_ashr_i32 s14, s11, 31 ; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] -; GCN-NEXT: s_add_u32 s0, s8, s14 +; GCN-NEXT: s_add_u32 s0, s10, s14 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: s_mov_b32 s15, s14 -; GCN-NEXT: s_addc_u32 s1, s9, s14 -; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[14:15] +; GCN-NEXT: s_addc_u32 s1, s11, s14 +; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s8, v3 -; GCN-NEXT: v_mul_hi_u32 v7, s8, v2 -; GCN-NEXT: v_mul_hi_u32 v9, s8, v3 -; GCN-NEXT: v_mul_hi_u32 v10, s9, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s9, v3 +; GCN-NEXT: v_mul_lo_u32 v5, s10, v3 +; GCN-NEXT: v_mul_hi_u32 v7, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v9, s10, v3 +; GCN-NEXT: v_mul_hi_u32 v10, s11, v3 +; GCN-NEXT: v_mul_lo_u32 v3, s11, v3 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v9, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s9, v2 +; GCN-NEXT: v_mul_lo_u32 v9, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 ; GCN-NEXT: v_mov_b32_e32 v8, s3 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, v2, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v6, s12, v4 -; GCN-NEXT: v_mul_hi_u32 v7, s12, v5 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v0, s13, v5 -; GCN-NEXT: v_subb_u32_e32 v3, vcc, v1, v8, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v7, v6 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s12, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s12, v2 +; GCN-NEXT: v_mul_lo_u32 v6, s13, v2 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_mul_lo_u32 v5, s12, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 
v4, v6 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, s11, v4 ; GCN-NEXT: v_mov_b32_e32 v7, s13 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s12, v5 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, s9, v0 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s8, v1 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, s10, v5 ; GCN-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc -; GCN-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v1 +; GCN-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 ; GCN-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] ; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 ; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] @@ -6667,30 +6714,30 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 ; GCN-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] -; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 2, v5 -; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v4, s[0:1] -; GCN-NEXT: v_add_i32_e64 v9, s[0:1], 1, v5 -; GCN-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v4, s[0:1] +; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 +; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] +; GCN-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 +; GCN-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GCN-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v8, s9 -; GCN-NEXT: v_subb_u32_e32 v0, vcc, v8, v0, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v0 +; GCN-NEXT: v_mov_b32_e32 v8, s11 +; GCN-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 ; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v1, v9, v7, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GCN-NEXT: s_xor_b64 s[0:1], s[14:15], s[10:11] 
-; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc -; GCN-NEXT: v_xor_b32_e32 v1, s0, v1 -; GCN-NEXT: v_xor_b32_e32 v4, s1, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v1 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v5, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 +; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN-NEXT: s_xor_b64 s[0:1], s[14:15], s[8:9] +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GCN-NEXT: v_xor_b32_e32 v2, s0, v2 +; GCN-NEXT: v_xor_b32_e32 v3, s1, v3 +; GCN-NEXT: v_mov_b32_e32 v4, s1 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 +; GCN-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y @@ -7010,8 +7057,13 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 % define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { ; CHECK-LABEL: @srem_v2i64_pow2k_denom( -; CHECK-NEXT: [[R:%.*]] = srem <2 x i64> [[X:%.*]], -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = srem i64 [[TMP1]], 4096 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = srem i64 [[TMP4]], 4096 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_v2i64_pow2k_denom: @@ -7022,20 +7074,20 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, ; 
GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s9, s3, 31 -; GCN-NEXT: s_lshr_b32 s9, s9, 20 -; GCN-NEXT: s_add_u32 s9, s2, s9 -; GCN-NEXT: s_addc_u32 s10, s3, 0 -; GCN-NEXT: s_and_b32 s9, s9, s8 -; GCN-NEXT: s_sub_u32 s2, s2, s9 -; GCN-NEXT: s_subb_u32 s3, s3, s10 ; GCN-NEXT: s_ashr_i32 s9, s1, 31 ; GCN-NEXT: s_lshr_b32 s9, s9, 20 ; GCN-NEXT: s_add_u32 s9, s0, s9 ; GCN-NEXT: s_addc_u32 s10, s1, 0 -; GCN-NEXT: s_and_b32 s8, s9, s8 -; GCN-NEXT: s_sub_u32 s0, s0, s8 +; GCN-NEXT: s_and_b32 s9, s9, s8 +; GCN-NEXT: s_sub_u32 s0, s0, s9 ; GCN-NEXT: s_subb_u32 s1, s1, s10 +; GCN-NEXT: s_ashr_i32 s9, s3, 31 +; GCN-NEXT: s_lshr_b32 s9, s9, 20 +; GCN-NEXT: s_add_u32 s9, s2, s9 +; GCN-NEXT: s_addc_u32 s10, s3, 0 +; GCN-NEXT: s_and_b32 s8, s9, s8 +; GCN-NEXT: s_sub_u32 s2, s2, s8 +; GCN-NEXT: s_subb_u32 s3, s3, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -7050,8 +7102,15 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: @srem_v2i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> , [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = srem <2 x i64> [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store <2 x i64> [[R]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 
1 +; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_v2i64_pow2_shl_denom: @@ -7062,8 +7121,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: s_mov_b32 s18, 0x4f800000 ; GCN-NEXT: s_mov_b32 s19, 0x5f7ffffc ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b64 s[14:15], s[2:3], s4 -; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 +; GCN-NEXT: s_lshl_b64 s[14:15], s[2:3], s6 +; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GCN-NEXT: s_ashr_i32 s4, s3, 31 ; GCN-NEXT: s_add_u32 s2, s2, s4 ; GCN-NEXT: s_mov_b32 s5, s4 @@ -7086,8 +7145,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s12, s11, 31 -; GCN-NEXT: s_add_u32 s0, s10, s12 +; GCN-NEXT: s_ashr_i32 s12, s9, 31 +; GCN-NEXT: s_add_u32 s0, s8, s12 ; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 ; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 ; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 @@ -7104,8 +7163,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v1, v5 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 -; GCN-NEXT: s_addc_u32 s1, s11, s12 -; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[12:13] +; GCN-NEXT: s_addc_u32 s1, s9, s12 +; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] ; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc ; GCN-NEXT: v_mov_b32_e32 v4, 0 @@ -7140,15 +7199,15 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v7, s11, 
v1 -; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v2, s8, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 +; GCN-NEXT: v_mul_hi_u32 v5, s8, v1 +; GCN-NEXT: v_mul_hi_u32 v7, s9, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s9, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s9, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s9, v0 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc @@ -7161,9 +7220,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_mul_lo_u32 v0, s16, v0 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s9, v1 ; GCN-NEXT: v_mov_b32_e32 v3, s17 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v0 ; GCN-NEXT: v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1] @@ -7178,14 +7237,14 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: s_ashr_i32 s2, s15, 31 ; GCN-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GCN-NEXT: s_add_u32 s10, s14, s2 +; GCN-NEXT: s_add_u32 s8, s14, s2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: v_mov_b32_e32 v3, s9 ; GCN-NEXT: s_mov_b32 s3, s2 -; GCN-NEXT: s_addc_u32 s11, s15, s2 -; GCN-NEXT: s_xor_b64 s[10:11], s[10:11], s[2:3] -; GCN-NEXT: v_cvt_f32_u32_e32 v7, s10 -; GCN-NEXT: v_cvt_f32_u32_e32 v9, s11 +; GCN-NEXT: s_addc_u32 s9, s15, s2 +; GCN-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] +; GCN-NEXT: v_cvt_f32_u32_e32 v7, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v9, s9 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; 
GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc @@ -7204,13 +7263,13 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_mac_f32_e32 v3, s21, v5 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GCN-NEXT: s_sub_u32 s2, 0, s10 +; GCN-NEXT: s_sub_u32 s2, 0, s8 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: v_mul_hi_u32 v2, s2, v3 ; GCN-NEXT: v_mul_lo_u32 v7, s2, v5 -; GCN-NEXT: s_subb_u32 s3, 0, s11 +; GCN-NEXT: s_subb_u32 s3, 0, s9 ; GCN-NEXT: v_mul_lo_u32 v8, s3, v3 -; GCN-NEXT: s_ashr_i32 s14, s9, 31 +; GCN-NEXT: s_ashr_i32 s14, s11, 31 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; GCN-NEXT: v_mul_lo_u32 v7, s2, v3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v8 @@ -7255,68 +7314,68 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] -; GCN-NEXT: s_add_u32 s0, s8, s14 +; GCN-NEXT: s_add_u32 s0, s10, s14 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: s_addc_u32 s1, s9, s14 -; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[14:15] +; GCN-NEXT: s_addc_u32 s1, s11, s14 +; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s8, v3 -; GCN-NEXT: v_mul_hi_u32 v7, s8, v2 -; GCN-NEXT: v_mul_hi_u32 v9, s8, v3 -; GCN-NEXT: v_mul_hi_u32 v10, s9, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s9, v3 +; GCN-NEXT: v_mul_lo_u32 v5, s10, v3 +; GCN-NEXT: v_mul_hi_u32 v7, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v9, s10, v3 +; GCN-NEXT: v_mul_hi_u32 v10, s11, v3 +; GCN-NEXT: v_mul_lo_u32 v3, s11, v3 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v9, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s9, v2 +; GCN-NEXT: v_mul_lo_u32 v9, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 ; GCN-NEXT: 
v_mov_b32_e32 v8, s12 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, v2, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s10, v5 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s12, v0 -; GCN-NEXT: v_mul_lo_u32 v0, s11, v5 -; GCN-NEXT: v_subb_u32_e32 v3, vcc, v1, v8, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v6, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s10, v5 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s9, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s8, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v3, s8, v3 +; GCN-NEXT: v_mul_hi_u32 v4, s8, v2 +; GCN-NEXT: v_mul_lo_u32 v5, s9, v2 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s8, v2 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, s11, v3 +; GCN-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GCN-NEXT: v_subrev_i32_e64 v6, s[0:1], s10, v1 +; GCN-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v2 ; GCN-NEXT: v_subb_u32_e64 v5, s[2:3], v4, v5, s[0:1] ; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 ; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[0:1] -; GCN-NEXT: v_subrev_i32_e64 v8, s[0:1], s10, v6 +; GCN-NEXT: 
v_subrev_i32_e64 v8, s[0:1], s8, v6 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NEXT: v_subb_u32_e32 v0, vcc, v5, v0, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v0 +; GCN-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v1 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s11, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 ; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v8, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN-NEXT: v_xor_b32_e32 v1, s14, v1 -; GCN-NEXT: v_xor_b32_e32 v4, s14, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s14 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s14, v1 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN-NEXT: v_xor_b32_e32 v2, s14, v2 +; GCN-NEXT: v_xor_b32_e32 v3, s14, v3 +; GCN-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2 +; GCN-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index 473dc6050930d..74fe04bcf3473 100644 --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -3,6 +3,9 @@ ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=FUNC,GCN,VI declare i16 @llvm.bswap.i16(i16) nounwind readnone +declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) 
nounwind readnone +declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) nounwind readnone +declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>) nounwind readnone declare i32 @llvm.bswap.i32(i32) nounwind readnone declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone @@ -10,6 +13,7 @@ declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone declare i64 @llvm.bswap.i64(i64) nounwind readnone declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone +declare i48 @llvm.bswap.i48(i48) #1 define amdgpu_kernel void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { ; SI-LABEL: test_bswap_i32: @@ -370,9 +374,9 @@ define float @missing_truncate_promote_bswap(i32 %arg) { ; VI-LABEL: missing_truncate_promote_bswap: ; VI: ; %bb.0: ; %bb ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, 0x10203 +; VI-NEXT: s_mov_b32 s4, 0xc0c0001 ; VI-NEXT: v_perm_b32 v0, 0, v0, s4 -; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] bb: %tmp = trunc i32 %arg to i16 @@ -381,3 +385,197 @@ bb: %tmp3 = fpext half %tmp2 to float ret float %tmp3 } + +define i16 @v_bswap_i16(i16 %src) { +; SI-LABEL: v_bswap_i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bswap_i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0xc0c0001 +; VI-NEXT: v_perm_b32 v0, 0, v0, s4 +; VI-NEXT: s_setpc_b64 s[30:31] + %bswap = call i16 @llvm.bswap.i16(i16 %src) + ret i16 %bswap +} + +define i32 
@v_bswap_i16_zext_to_i32(i16 %src) { +; SI-LABEL: v_bswap_i16_zext_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bswap_i16_zext_to_i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0xc0c0001 +; VI-NEXT: v_perm_b32 v0, 0, v0, s4 +; VI-NEXT: s_setpc_b64 s[30:31] + %bswap = call i16 @llvm.bswap.i16(i16 %src) + %zext = zext i16 %bswap to i32 + ret i32 %zext +} + +define i32 @v_bswap_i16_sext_to_i32(i16 %src) { +; SI-LABEL: v_bswap_i16_sext_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bswap_i16_sext_to_i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0xc0c0001 +; VI-NEXT: v_perm_b32 v0, 0, v0, s4 +; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-NEXT: s_setpc_b64 s[30:31] + %bswap = call i16 @llvm.bswap.i16(i16 %src) + %zext = sext i16 %bswap to i32 + ret i32 %zext +} + +define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) { +; SI-LABEL: v_bswap_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_alignbit_b32 v3, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v2 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 
16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bswap_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x2030001 +; VI-NEXT: v_perm_b32 v0, 0, v0, s4 +; VI-NEXT: s_setpc_b64 s[30:31] + %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src) + ret <2 x i16> %bswap +} + +define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) { +; SI-LABEL: v_bswap_v3i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_alignbit_b32 v4, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; SI-NEXT: v_alignbit_b32 v5, v2, v2, 8 +; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bswap_v3i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x2030001 +; VI-NEXT: v_perm_b32 v0, 0, v0, s4 +; VI-NEXT: v_perm_b32 v1, 0, v1, s4 +; VI-NEXT: s_setpc_b64 s[30:31] + %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %src) + ret <3 x i16> %bswap +} + +define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) { +; SI-LABEL: v_bswap_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v3, v3, 8 +; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: s_mov_b32 s5, 0xffff0000 +; SI-NEXT: v_alignbit_b32 v5, v2, v2, 8 +; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24 +; SI-NEXT: v_alignbit_b32 v6, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; SI-NEXT: v_alignbit_b32 v7, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; 
SI-NEXT: v_bfi_b32 v3, s4, v3, v4 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v7 +; SI-NEXT: v_and_b32_e32 v3, s5, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bswap_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x2030001 +; VI-NEXT: v_perm_b32 v0, 0, v0, s4 +; VI-NEXT: v_perm_b32 v1, 0, v1, s4 +; VI-NEXT: s_setpc_b64 s[30:31] + %bswap = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %src) + ret <4 x i16> %bswap +} + +define i64 @v_bswap_i48(i64 %src) { +; SI-LABEL: v_bswap_i48: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; SI-NEXT: v_bfi_b32 v2, s4, v0, v2 +; SI-NEXT: v_bfi_b32 v0, s4, v1, v3 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bswap_i48: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x10203 +; VI-NEXT: v_perm_b32 v2, 0, v0, s4 +; VI-NEXT: v_perm_b32 v0, 0, v1, s4 +; VI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_setpc_b64 s[30:31] + %trunc = trunc i64 %src to i48 + %bswap = call i48 @llvm.bswap.i48(i48 %trunc) + %zext = zext i48 %bswap to i64 + ret i64 %zext +} diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index cd6cecaa4ad7b..29f73d6b37b81 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv: @@ -139,6 +140,111 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v5, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i32 s2, s7, 31 +; GCN-IR-NEXT: s_ashr_i32 s8, s13, 31 +; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_mov_b32 s9, s8 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[6:7] +; GCN-IR-NEXT: s_sub_u32 s10, s0, s2 +; GCN-IR-NEXT: s_subb_u32 s11, s1, s2 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[8:9], s[12:13] +; GCN-IR-NEXT: s_flbit_i32_b32 s14, s10 +; GCN-IR-NEXT: s_sub_u32 s6, s0, s8 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s11 +; GCN-IR-NEXT: s_subb_u32 s7, s1, s8 +; GCN-IR-NEXT: s_flbit_i32_b32 s15, s6 +; GCN-IR-NEXT: s_add_i32 s14, s14, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 +; GCN-IR-NEXT: s_add_i32 s15, s15, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s16, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s14 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s11, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GCN-IR-NEXT: v_mov_b32_e32 v1, s16 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s15 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: 
v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v1, v0 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[12:13], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_cbranch_vccz BB0_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB0_7 +; GCN-IR-NEXT: BB0_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[10:11], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB0_6 +; GCN-IR-NEXT: BB0_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[10:11], v0 +; GCN-IR-NEXT: s_add_u32 s10, s6, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s11, s7, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB0_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s11 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s10, v8 +; 
GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s6, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s7, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_5 +; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB0_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[8:9], s[2:3] +; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 +; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %result = sdiv i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -272,6 +378,105 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v8, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 31, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v5 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v3, v5, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v2, v5, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v0, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v1, v4, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v2, v5 +; GCN-IR-NEXT: 
v_subb_u32_e32 v1, vcc, v3, v5, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[12:13] +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v12 +; GCN-IR-NEXT: v_ffbh_u32_e32 v9, v13 +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 32, v8 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; GCN-IR-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v2, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[8:9] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[8:9] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v13, 0, s[6:7] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v12, 0, s[6:7] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v8 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, 63, v8 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[8:9] +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[12:13], v10 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[16:17], v[12:13], v2 +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 +; GCN-IR-NEXT: BB1_3: ; %udiv-do-while +; GCN-IR-NEXT: 
; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v16, v16, v8 +; GCN-IR-NEXT: v_or_b32_e32 v11, v15, v11 +; GCN-IR-NEXT: v_or_b32_e32 v10, v14, v10 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v12, v16 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], v13, v17, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v8 +; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v1 +; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v16, vcc, v16, v14 +; GCN-IR-NEXT: v_subb_u32_e32 v17, vcc, v17, v15, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v15, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v14, v8 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB1_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB1_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[10:11], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v9, v1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v8, v0 +; GCN-IR-NEXT: BB1_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v6 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v2, v2, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v3, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %x, %y ret i64 %result } @@ -289,23 +494,53 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 ; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 40 ; GCN-NEXT: s_xor_b32 s5, 
s4, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GCN-NEXT: s_ashr_i32 s4, s5, 30 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-NEXT: s_or_b32 s4, s4, 1 -; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv24_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[8:9], 40 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s4, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: 
v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 %result = sdiv i64 %1, %2 @@ -319,31 +554,36 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v3 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_mul_hi_u32 v3, v2, v1 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v1 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v4 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GCN-NEXT: v_mul_hi_u32 v3, v3, v2 -; GCN-NEXT: v_add_i32_e64 v4, s[4:5], v2, v3 -; GCN-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v2, v0 -; GCN-NEXT: v_mul_lo_u32 v3, v2, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, -1, v2 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; GCN-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v3 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv24_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 25 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %1 = lshr i64 %x, 40 %2 = lshr i64 %y, 40 %result = sdiv i64 %1, %2 @@ -354,47 +594,56 @@ define amdgpu_kernel void @s_test_sdiv32_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_sdiv32_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_ashr_i32 s4, s7, 31 -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_add_i32 s0, s7, s4 -; GCN-NEXT: s_xor_b32 s5, s2, s3 -; GCN-NEXT: s_xor_b32 s2, s0, s4 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: 
v_mul_lo_u32 v1, v0, s5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] -; GCN-NEXT: s_xor_b32 s0, s4, s3 -; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_xor_b32 s4, s7, s8 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s7 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv32_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_xor_b32 s4, s7, s8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s7 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s8 +; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; 
GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 32 %2 = ashr i64 %y, 32 %result = sdiv i64 %1, %2 @@ -406,49 +655,62 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_sdiv31_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], 33 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_ashr_i32 s4, s0, 31 -; GCN-NEXT: s_add_i32 s1, s2, s3 -; GCN-NEXT: s_add_i32 s0, s0, s4 -; GCN-NEXT: s_xor_b32 s5, s1, s3 -; GCN-NEXT: s_xor_b32 s2, s0, s4 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 -; GCN-NEXT: v_cmp_le_u32_e32 
vcc, s5, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] -; GCN-NEXT: s_xor_b32 s0, s4, s3 -; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 +; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 33 +; GCN-NEXT: s_xor_b32 s5, s4, s6 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-NEXT: s_ashr_i32 s4, s5, 30 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv31_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[8:9], 33 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s4, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: 
v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 31 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 33 %2 = ashr i64 %y, 33 %result = sdiv i64 %1, %2 @@ -469,23 +731,53 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 ; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 41 ; GCN-NEXT: s_xor_b32 s5, s4, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GCN-NEXT: s_ashr_i32 s4, s5, 30 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-NEXT: s_or_b32 s4, s4, 1 -; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv23_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 +; GCN-IR-NEXT: s_ashr_i64 
s[6:7], s[8:9], 41 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s4, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 23 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 41 %2 = ashr i64 %y, 41 %result = sdiv i64 %1, %2 @@ -497,49 +789,62 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_sdiv25_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], 39 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_ashr_i32 s4, s0, 31 -; GCN-NEXT: s_add_i32 s1, s2, s3 -; GCN-NEXT: s_add_i32 s0, s0, s4 -; GCN-NEXT: s_xor_b32 s5, s1, s3 -; GCN-NEXT: s_xor_b32 s2, s0, s4 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, 
s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] -; GCN-NEXT: s_xor_b32 s0, s4, s3 -; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 +; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 39 +; GCN-NEXT: s_xor_b32 s5, s4, s6 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-NEXT: s_ashr_i32 s4, s5, 30 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv25_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 +; 
GCN-IR-NEXT: s_ashr_i64 s[6:7], s[8:9], 39 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s4, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 25 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 39 %2 = ashr i64 %y, 39 %result = sdiv i64 %1, %2 @@ -556,10 +861,10 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[0:1], s[8:9], 40 -; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], 40 -; GCN-NEXT: s_ashr_i64 s[8:9], s[12:13], 40 -; GCN-NEXT: s_ashr_i64 s[10:11], s[14:15], 40 +; GCN-NEXT: s_ashr_i64 s[0:1], s[10:11], 40 +; GCN-NEXT: s_ashr_i64 s[2:3], s[8:9], 40 +; GCN-NEXT: s_ashr_i64 s[8:9], s[14:15], 40 +; GCN-NEXT: s_ashr_i64 s[10:11], s[12:13], 40 ; GCN-NEXT: s_xor_b32 s1, s2, s10 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s10 @@ -588,12 +893,59 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GCN-NEXT: v_bfe_i32 v2, v0, 0, 24 -; GCN-NEXT: v_bfe_i32 v0, v1, 0, 24 -; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-NEXT: v_bfe_i32 v2, v1, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; 
GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv24_v2i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x11 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[10:11], 40 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[8:9], 40 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[14:15], 40 +; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[12:13], 40 +; GCN-IR-NEXT: s_xor_b32 s1, s2, s10 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s10 +; GCN-IR-NEXT: s_xor_b32 s2, s0, s8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, s0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s8 +; GCN-IR-NEXT: s_ashr_i32 s0, s1, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GCN-IR-NEXT: s_ashr_i32 s1, s2, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GCN-IR-NEXT: s_or_b32 s0, s0, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v4, v0, v4 +; GCN-IR-NEXT: s_or_b32 s1, s1, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v5, v2, v5 +; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v6, s0 +; GCN-IR-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v7, s1 +; GCN-IR-NEXT: v_mad_f32 v0, -v4, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GCN-IR-NEXT: v_mad_f32 v2, -v5, v3, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v3| +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_bfe_i32 v2, v1, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-IR-NEXT: buffer_store_dwordx4 
v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr <2 x i64> %x, %2 = ashr <2 x i64> %y, %result = sdiv <2 x i64> %1, %2 @@ -636,6 +988,119 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv24_48: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 +; GCN-IR-NEXT: s_sext_i32_i16 s9, s6 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 24 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 +; GCN-IR-NEXT: s_ashr_i32 s6, s9, 31 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 24 +; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_mov_b32 s7, s6 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; GCN-IR-NEXT: s_sub_u32 s10, s0, s2 +; GCN-IR-NEXT: s_subb_u32 s11, s1, s2 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] +; GCN-IR-NEXT: s_flbit_i32_b32 s14, s10 +; GCN-IR-NEXT: s_sub_u32 s8, s0, s6 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s11 +; GCN-IR-NEXT: s_subb_u32 s9, s1, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s15, s8 +; GCN-IR-NEXT: s_add_i32 s14, s14, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[8:9], 0 +; GCN-IR-NEXT: s_add_i32 s15, s15, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s16, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s14 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s11, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GCN-IR-NEXT: v_mov_b32_e32 v1, s16 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s15 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v1, v2, 
vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v1, v0 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[12:13], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_cbranch_vccz BB9_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB9_7 +; GCN-IR-NEXT: BB9_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[10:11], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB9_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB9_6 +; GCN-IR-NEXT: BB9_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[10:11], v0 +; GCN-IR-NEXT: s_add_u32 s10, s8, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s11, s9, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB9_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s11 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s10, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, 
s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s8, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s9, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB9_5 +; GCN-IR-NEXT: BB9_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB9_7: ; %udiv-end +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[6:7], s[2:3] +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 +; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 %result = sdiv i48 %1, %2 @@ -769,6 +1234,93 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i32 s2, s7, 31 +; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[6:7] +; GCN-IR-NEXT: s_sub_u32 s8, s0, s2 +; GCN-IR-NEXT: s_subb_u32 s9, s1, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s8 +; GCN-IR-NEXT: s_add_i32 s0, s0, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s1, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s0 +; 
GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc5, v0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[8:9], 0 +; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[6:7] +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_cbranch_vccz BB10_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB10_7 +; GCN-IR-NEXT: BB10_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB10_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB10_6 +; GCN-IR-NEXT: BB10_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v0 +; GCN-IR-NEXT: s_add_u32 s7, s8, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s10, s9, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB10_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s10 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; 
GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s7, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s8, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s9, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB10_5 +; GCN-IR-NEXT: BB10_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB10_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: v_xor_b32_e32 v0, s2, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s3, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 +; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %result = sdiv i64 24, %x store i64 %result, i64 addrspace(1)* %out ret void @@ -892,6 +1444,91 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: s_movk_i32 s4, 0xffc5 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v2, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v4, v5, v4, 
vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, s4, v4 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_addc_u32_e64 v7, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[6:7] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 24, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB11_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 63, v6 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[8:9], 24, v8 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB11_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[14:15], 24, v4 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: BB11_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc +; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v6 +; GCN-IR-NEXT: v_or_b32_e32 v9, v13, v9 +; GCN-IR-NEXT: v_or_b32_e32 v8, v12, v8 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v10, v14 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[4:5], v11, v15, s[4:5] 
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v1 +; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v14, vcc, v14, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v15, vcc, v15, v13, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB11_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB11_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v0 +; GCN-IR-NEXT: BB11_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 24, %x ret i64 %result } @@ -1015,6 +1652,96 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv_pow2_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: s_mov_b32 s11, 0 +; GCN-IR-NEXT: s_movk_i32 s4, 0xffd0 +; GCN-IR-NEXT: s_mov_b32 s10, 0x8000 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v2, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s10 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v1 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, 32, v5 +; GCN-IR-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, s4, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_addc_u32_e64 v7, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[6:7] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB12_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 63, v6 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], s[10:11], v8 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB12_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_lshr_b64 v[14:15], s[4:5], v4 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: BB12_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc +; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v8 +; GCN-IR-NEXT: v_or_b32_e32 v7, v13, v7 +; GCN-IR-NEXT: v_or_b32_e32 v6, v12, v6 +; 
GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v10, v14 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], v11, v15, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v8 +; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v1 +; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v14, vcc, v14, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v15, vcc, v15, v13, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v13, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v8 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB12_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB12_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[6:7], 1 +; GCN-IR-NEXT: v_or_b32_e32 v5, v9, v1 +; GCN-IR-NEXT: v_or_b32_e32 v4, v8, v0 +; GCN-IR-NEXT: BB12_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 32768, %x ret i64 %result } @@ -1029,6 +1756,89 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv_pow2_k_den_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v2, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v8 +; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v9 +; 
GCN-IR-NEXT: v_add_i32_e64 v0, s[4:5], 32, v0 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 48, v0 +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v9, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v8, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB13_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[8:9], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB13_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[8:9], v0 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: BB13_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_or_b32_e32 v7, v9, v7 +; GCN-IR-NEXT: v_or_b32_e32 v6, v8, v6 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; 
GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v10 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v11, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8 +; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8 +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v12, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB13_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB13_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[6:7], 1 +; GCN-IR-NEXT: v_or_b32_e32 v1, v5, v1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-IR-NEXT: BB13_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %x, 32768 ret i64 %result } @@ -1044,22 +1854,49 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-NEXT: s_ashr_i32 s1, s0, 30 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: s_or_b32 s0, s1, 1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: s_or_b32 s0, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v1, s2, v1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mad_f32 v3, -v1, v0, s2 +; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 
31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s2, 0x41c00000 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-IR-NEXT: s_ashr_i32 s1, s0, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-IR-NEXT: s_or_b32 s0, s1, 1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, s2, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s0 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v3, -v1, v0, s2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = sdiv i64 24, %x.shr store i64 %result, i64 addrspace(1)* %out @@ -1077,21 +1914,47 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-NEXT: s_ashr_i32 s1, s0, 30 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: s_or_b32 s0, s1, 1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 -; GCN-NEXT: s_or_b32 s0, s0, 1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; 
GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_sdiv24_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s2, 0x46b6fe00 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-IR-NEXT: s_ashr_i32 s1, s0, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-IR-NEXT: s_or_b32 s0, s1, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s0 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = sdiv i64 %x.shr, 23423 store i64 %result, i64 addrspace(1)* %out @@ -1104,20 +1967,40 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_or_b32_e32 v0, 1, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GCN-NEXT: 
v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_mad_f32 v3, -v2, v0, s4 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v3, -v2, v0, s4 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = ashr i64 %x, 40 %result = sdiv i64 24, %x.shr ret i64 %result @@ -1129,20 +2012,40 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_or_b32_e32 v0, 1, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GCN-NEXT: 
v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_mad_f32 v3, -v2, v0, s4 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv24_pow2_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v3, -v2, v0, s4 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = ashr i64 %x, 40 %result = sdiv i64 32768, %x.shr ret i64 %result @@ -1158,6 +2061,25 @@ define i64 @v_test_sdiv24_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_sdiv24_pow2_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x38000000, v0 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, s4, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 +; 
GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = ashr i64 %x, 40 %result = sdiv i64 %x.shr, 32768 ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 9113f6c2e6385..73da5d42e15bb 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem: @@ -122,6 +123,106 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s2 +; GCN-IR-NEXT: s_add_i32 s11, s0, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s3 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s7 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 +; GCN-IR-NEXT: s_add_i32 s8, s10, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s11 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 +; GCN-IR-NEXT: 
v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_vccz BB0_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB0_7 +; GCN-IR-NEXT: BB0_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[6:7], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB0_6 +; GCN-IR-NEXT: BB0_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[6:7], v0 +; GCN-IR-NEXT: s_add_u32 s8, s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s9, s3, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB0_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s9 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 
v2, s[0:1], s8, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s2, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s3, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_5 +; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB0_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s10, -1 +; GCN-IR-NEXT: s_mov_b32 s8, s4 +; GCN-IR-NEXT: s_mov_b32 s9, s5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s2, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s3, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s7 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-IR-NEXT: s_endpgm %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -253,6 +354,110 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_srem: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v5, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v6 +; GCN-IR-NEXT: v_xor_b32_e32 v2, v2, 
v6 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v2 +; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v3 +; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v9, v1 +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 32, v6 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 32, v8 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN-IR-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v6, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[8:9] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[8:9] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[6:7] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[6:7] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v8 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, 63, v8 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[8:9] +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[0:1], v10 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[16:17], v[0:1], v6 +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 
v13, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 +; GCN-IR-NEXT: BB1_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc +; GCN-IR-NEXT: v_or_b32_e32 v16, v16, v8 +; GCN-IR-NEXT: v_or_b32_e32 v11, v15, v11 +; GCN-IR-NEXT: v_or_b32_e32 v10, v14, v10 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v12, v16 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], v13, v17, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v8 +; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v3 +; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v16, vcc, v16, v14 +; GCN-IR-NEXT: v_subb_u32_e32 v17, vcc, v17, v15, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v15, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v14, v8 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB1_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB1_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[10:11], 1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v9, v7 +; GCN-IR-NEXT: v_or_b32_e32 v6, v8, v6 +; GCN-IR-NEXT: BB1_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mul_lo_u32 v7, v2, v7 +; GCN-IR-NEXT: v_mul_hi_u32 v8, v2, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, v6 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v8, v7 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v4 +; 
GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v5 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %x, %y ret i64 %result } @@ -270,25 +475,57 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 ; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 41 ; GCN-NEXT: s_xor_b32 s5, s4, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GCN-NEXT: s_ashr_i32 s5, s5, 30 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-NEXT: s_or_b32 s5, s5, 1 -; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem23_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[8:9], 41 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: 
v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s5, s5, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s5 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s6 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 23 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 41 %2 = ashr i64 %y, 41 %result = srem i64 %1, %2 @@ -309,25 +546,57 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 ; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 40 ; GCN-NEXT: s_xor_b32 s5, s4, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GCN-NEXT: s_ashr_i32 s5, s5, 30 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 ; GCN-NEXT: s_or_b32 s5, s5, 1 -; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, 
v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem24_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[8:9], 40 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s5, s5, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s5 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s6 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 %result = srem i64 %1, %2 @@ -350,15 +619,39 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_f32_e32 v5, v3, v5 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 ; GCN-NEXT: v_mad_f32 v3, -v5, v4, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v3, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v2, v1 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: 
s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_srem24_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: v_ashr_i64 v[1:2], v[2:3], 40 +; GCN-IR-NEXT: v_xor_b32_e32 v2, v0, v1 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v4, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 30, v2 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v5, v4 +; GCN-IR-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v5, v3, v5 +; GCN-IR-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-IR-NEXT: v_mad_f32 v3, -v5, v4, v3 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v2, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 %result = srem i64 %1, %2 @@ -369,48 +662,66 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], 39 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-NEXT: s_ashr_i32 s1, s2, 31 -; GCN-NEXT: s_ashr_i32 s4, s0, 31 -; GCN-NEXT: s_add_i32 s2, s2, s1 -; GCN-NEXT: s_add_i32 s0, s0, s4 -; GCN-NEXT: s_xor_b32 s5, s2, s1 -; GCN-NEXT: s_xor_b32 s2, s0, s4 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: 
v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s5, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v1 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s4, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 +; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 39 +; GCN-NEXT: s_xor_b32 s5, s4, s6 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: s_or_b32 s5, s5, 1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem25_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[8:9], 39 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s5, s5, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s5 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s6 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 25 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 39 %2 = ashr i64 %y, 39 %result = srem i64 %1, %2 @@ -422,48 +733,66 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], 33 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-NEXT: s_ashr_i32 s1, s2, 31 -; GCN-NEXT: s_ashr_i32 s4, s0, 31 -; GCN-NEXT: s_add_i32 s2, s2, s1 -; GCN-NEXT: s_add_i32 s0, s0, s4 -; GCN-NEXT: s_xor_b32 s5, s2, s1 -; 
GCN-NEXT: s_xor_b32 s2, s0, s4 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s5, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v1 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s4, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 +; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 33 +; GCN-NEXT: s_xor_b32 s5, s4, s6 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: s_or_b32 s5, s5, 1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem31_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[8:9], 33 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s5, s5, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s5 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s6 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 31 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 33 %2 = ashr i64 %y, 33 %result = srem i64 %1, %2 @@ -476,46 +805,60 @@ define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_srem32_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_ashr_i32 s0, s2, 31 -; GCN-NEXT: s_ashr_i32 s4, s7, 31 -; GCN-NEXT: s_add_i32 s2, 
s2, s0 -; GCN-NEXT: s_add_i32 s1, s7, s4 -; GCN-NEXT: s_xor_b32 s5, s2, s0 -; GCN-NEXT: s_xor_b32 s2, s1, s4 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s5, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v1 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s4, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_xor_b32 s4, s7, s8 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s7 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s7, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, 
s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem32_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_xor_b32 s4, s7, s8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s7 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s8 +; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s8 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s7, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 32 %2 = ashr i64 %y, 32 %result = srem i64 %1, %2 @@ -662,6 +1005,121 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v3, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem33_64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[6:7], 31 +; GCN-IR-NEXT: s_ashr_i32 s2, s7, 31 +; GCN-IR-NEXT: s_ashr_i32 s6, s9, 31 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 31 +; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_mov_b32 s7, s6 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[8:9], 
s[6:7] +; GCN-IR-NEXT: s_sub_u32 s8, s0, s2 +; GCN-IR-NEXT: s_subb_u32 s9, s1, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s7, s8 +; GCN-IR-NEXT: s_sub_u32 s10, s10, s6 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[8:9], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s9 +; GCN-IR-NEXT: s_subb_u32 s11, s11, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s10 +; GCN-IR-NEXT: s_add_i32 s14, s7, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], 0 +; GCN-IR-NEXT: s_add_i32 s12, s13, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s11 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s14 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s12 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s11, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v1, v0 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[6:7], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_vccz BB8_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB8_7 +; GCN-IR-NEXT: BB8_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB8_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB8_6 +; GCN-IR-NEXT: 
BB8_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[8:9], v0 +; GCN-IR-NEXT: s_add_u32 s6, s10, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s7, s11, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB8_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s7 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s6, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s10, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s11, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB8_5 +; GCN-IR-NEXT: BB8_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB8_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s10, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s10, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s11, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s10, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v5, s3 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GCN-IR-NEXT: 
v_subb_u32_e32 v1, vcc, v4, v1, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v0, s2, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s3, v1 +; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 31 %2 = ashr i64 %y, 31 %result = srem i64 %1, %2 @@ -706,6 +1164,127 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem24_48: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dword s7, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 +; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 24 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 +; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 24 +; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_mov_b32 s11, s10 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GCN-IR-NEXT: s_sub_u32 s8, s0, s2 +; GCN-IR-NEXT: s_subb_u32 s9, s1, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s8 +; GCN-IR-NEXT: s_sub_u32 s6, s6, s10 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[8:9], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s9 +; GCN-IR-NEXT: s_subb_u32 s7, s7, s10 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s6 +; GCN-IR-NEXT: s_add_i32 s14, s11, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 +; GCN-IR-NEXT: s_add_i32 s12, s13, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s14 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 +; GCN-IR-NEXT: 
v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s12 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v1, v0 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[10:11], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[10:11] +; GCN-IR-NEXT: s_cbranch_vccz BB9_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB9_7 +; GCN-IR-NEXT: BB9_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB9_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB9_6 +; GCN-IR-NEXT: BB9_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[8:9], v0 +; GCN-IR-NEXT: s_add_u32 s10, s6, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s11, s7, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB9_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s11 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; 
GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s10, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s6, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s7, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB9_5 +; GCN-IR-NEXT: BB9_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB9_7: ; %udiv-end +; GCN-IR-NEXT: v_mul_lo_u32 v1, s6, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s7, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s6, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v5, s3 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v0, s2, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s3, v1 +; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 %result = srem i48 %1, %2 @@ -833,6 +1412,96 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; 
GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31 +; GCN-IR-NEXT: s_mov_b32 s1, s0 +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[6:7], s[0:1] +; GCN-IR-NEXT: s_sub_u32 s2, s2, s0 +; GCN-IR-NEXT: s_subb_u32 s3, s3, s0 +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s2 +; GCN-IR-NEXT: s_add_i32 s0, s0, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s1, s3 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s0 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc5, v0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 +; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[6:7] +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_cbranch_vccz BB10_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB10_7 +; GCN-IR-NEXT: BB10_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB10_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB10_6 +; GCN-IR-NEXT: BB10_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v0 +; GCN-IR-NEXT: s_add_u32 s7, s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s8, s3, -1 +; 
GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB10_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s8 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s7, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s2, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s3, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB10_5 +; GCN-IR-NEXT: BB10_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB10_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s2, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s3, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v0 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %result = srem i64 24, %x store i64 %result, i64 addrspace(1)* %out ret void @@ -951,6 +1620,94 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, 
v3, v1, s[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_srem_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: s_movk_i32 s4, 0xffc5 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s4, v2 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, 24, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB11_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], 24, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB11_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[12:13], 24, v2 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: 
v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: BB11_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v4 +; GCN-IR-NEXT: v_or_b32_e32 v7, v11, v7 +; GCN-IR-NEXT: v_or_b32_e32 v6, v10, v6 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], v8, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v13, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 +; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v11, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB11_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB11_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[6:7], 1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN-IR-NEXT: BB11_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v3 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v3 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = srem i64 24, %x ret i64 %result } @@ -1069,6 +1826,99 @@ define i64 
@v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_srem_pow2_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: s_mov_b32 s11, 0 +; GCN-IR-NEXT: s_movk_i32 s4, 0xffd0 +; GCN-IR-NEXT: s_mov_b32 s10, 0x8000 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s10 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s4, v2 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v3, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB12_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[10:11], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB12_5 +; 
GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: BB12_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[4:5], v9, v13, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 +; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v11, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB12_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB12_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v6, v2 +; GCN-IR-NEXT: BB12_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v3 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2 +; GCN-IR-NEXT: 
v_add_i32_e32 v2, vcc, v4, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 +; GCN-IR-NEXT: v_add_i32_e64 v1, s[4:5], v2, v1 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = srem i64 32768, %x ret i64 %result } @@ -1085,6 +1935,92 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_srem_pow2_k_den_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 +; GCN-IR-NEXT: v_add_i32_e64 v4, s[4:5], 32, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 48, v4 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[6:7] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB13_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 63, v6 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[0:1], v8 +; 
GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB13_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v4 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: BB13_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_or_b32_e32 v9, v11, v9 +; GCN-IR-NEXT: v_or_b32_e32 v8, v10, v8 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[4:5] +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, s12, v12 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, 0, v13, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v14, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB13_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB13_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[8:9], 1 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: BB13_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 15 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; 
GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %x, 32768 ret i64 %result } @@ -1100,17 +2036,17 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-NEXT: s_ashr_i32 s1, s0, 30 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-NEXT: s_or_b32 s1, s1, 1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, s2, v1 ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mad_f32 v3, -v1, v0, s2 +; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 @@ -1118,6 +2054,35 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s2, 0x41c00000 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-IR-NEXT: s_ashr_i32 s1, s0, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-IR-NEXT: s_or_b32 s1, s1, 1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, s2, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 
s1 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v3, -v1, v0, s2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = srem i64 24, %x.shr store i64 %result, i64 addrspace(1)* %out @@ -1135,10 +2100,10 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-NEXT: s_ashr_i32 s1, s0, 30 -; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-NEXT: s_or_b32 s1, s1, 1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0 @@ -1153,6 +2118,35 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_srem24_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s2, 0x46b6fe00 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-IR-NEXT: s_ashr_i32 s1, s0, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-IR-NEXT: s_or_b32 s1, s1, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 
s1 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: s_movk_i32 s1, 0x5b7f +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = srem i64 %x.shr, 23423 store i64 %result, i64 addrspace(1)* %out @@ -1165,22 +2159,44 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GCN-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GCN-NEXT: v_mul_f32_e32 v3, s4, v3 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mad_f32 v4, -v3, v1, s4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_srem24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; 
GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v3, s4, v3 +; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-IR-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = ashr i64 %x, 40 %result = srem i64 24, %x.shr ret i64 %result @@ -1192,22 +2208,44 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GCN-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GCN-NEXT: v_mul_f32_e32 v3, s4, v3 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mad_f32 v4, -v3, v1, s4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] 
+; +; GCN-IR-LABEL: v_test_srem24_pow2_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v3, s4, v3 +; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-IR-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = ashr i64 %x, 40 %result = srem i64 32768, %x.shr ret i64 %result @@ -1225,6 +2263,27 @@ define i64 @v_test_srem24_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_srem24_pow2_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_ashr_i64 v[0:1], v[0:1], 40 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v3, 0x38000000, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-IR-NEXT: v_mad_f32 v2, -v3, s4, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, s4 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: v_lshlrev_b32_e32 v1, 15, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 
+; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = ashr i64 %x, 40 %result = srem i64 %x.shr, 32768 ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index c4795f1769dec..375fdc6163aa0 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv_i64: @@ -123,6 +124,95 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 % ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s2 +; GCN-IR-NEXT: s_add_i32 s11, s0, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s3 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s7 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 +; GCN-IR-NEXT: s_add_i32 s8, s10, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s11 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[8:9], 
0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_vccz BB0_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB0_7 +; GCN-IR-NEXT: BB0_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[6:7], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB0_6 +; GCN-IR-NEXT: BB0_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[6:7], v0 +; GCN-IR-NEXT: s_add_u32 s6, s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s7, s3, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB0_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s7 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s6, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; 
GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s2, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s3, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_5 +; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB0_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %result = udiv i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -241,6 +331,89 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_cndmask_b32_e32 v1, v13, v11, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 +; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v1 +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 32, v6 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN-IR-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v4, v5 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[6:7] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v1, 0, s[6:7] +; GCN-IR-NEXT: 
s_xor_b64 s[8:9], s[6:7], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[6:7] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 63, v6 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[0:1], v8 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v4 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: BB1_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_or_b32_e32 v9, v11, v9 +; GCN-IR-NEXT: v_or_b32_e32 v8, v10, v8 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[4:5], v1, v13, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3 +; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v11, vcc +; GCN-IR-NEXT: v_mov_b32_e32 
v11, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB1_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB1_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 +; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v5, v6, v0 +; GCN-IR-NEXT: BB1_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v4 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %x, %y ret i64 %result } @@ -249,40 +422,52 @@ define amdgpu_kernel void @s_test_udiv24_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_udiv24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_lshr_b32 s3, s7, 8 -; GCN-NEXT: s_lshr_b32 s2, s2, 8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v1 -; GCN-NEXT: v_sub_i32_e32 v1, 
vcc, s3, v1 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_lshr_b32 s4, s7, 8 +; GCN-NEXT: s_lshr_b32 s5, s8, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv24_64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_lshr_b32 s4, s7, 8 +; GCN-IR-NEXT: s_lshr_b32 s5, s8, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 40 %2 = lshr i64 %y, 40 %result = udiv i64 %1, %2 @@ -296,31 +481,36 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) { ; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v3 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_mul_hi_u32 v3, v2, v1 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v1 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v4 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GCN-NEXT: v_mul_hi_u32 v3, v3, v2 -; GCN-NEXT: v_add_i32_e64 v4, s[4:5], v2, v3 -; GCN-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v2, v0 -; GCN-NEXT: v_mul_lo_u32 v3, v2, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, -1, v2 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; GCN-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v3 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv24_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: 
v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %1 = lshr i64 %x, 40 %2 = lshr i64 %y, 40 %result = udiv i64 %1, %2 @@ -331,38 +521,46 @@ define amdgpu_kernel void @s_test_udiv32_i64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_udiv32_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s7 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s7, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 
v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv32_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 32 %2 = lshr i64 %y, 32 %result = udiv i64 %1, %2 @@ -374,40 +572,52 @@ define amdgpu_kernel void @s_test_udiv31_i64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_udiv31_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_lshr_b32 s3, s7, 1 -; GCN-NEXT: s_lshr_b32 s2, s2, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: 
v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s3, v1 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_lshr_b32 s4, s7, 1 +; GCN-NEXT: s_lshr_b32 s5, s8, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv31_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_lshr_b32 s4, s7, 1 +; GCN-IR-NEXT: s_lshr_b32 s5, s8, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: 
v_cvt_f32_u32_e32 v1, s5 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 33 %2 = lshr i64 %y, 33 %result = udiv i64 %1, %2 @@ -422,24 +632,49 @@ define amdgpu_kernel void @s_test_udiv23_i64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: s_load_dword s8, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_lshr_b32 s4, s7, 9 ; GCN-NEXT: s_lshr_b32 s5, s8, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 -; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv23_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; 
GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_lshr_b32 s4, s7, 9 +; GCN-IR-NEXT: s_lshr_b32 s5, s8, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 41 %2 = lshr i64 %y, 41 %result = udiv i64 %1, %2 @@ -570,6 +805,107 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GCN-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv24_i48: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc +; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd +; GCN-IR-NEXT: s_load_dword s7, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s8, 0xffff +; GCN-IR-NEXT: s_mov_b32 s9, 0xff000000 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_and_b32 s1, s3, s8 +; GCN-IR-NEXT: s_and_b32 s0, s2, s9 +; GCN-IR-NEXT: s_and_b32 s3, s7, s8 +; GCN-IR-NEXT: s_and_b32 s2, s6, s9 +; GCN-IR-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s6 +; 
GCN-IR-NEXT: s_flbit_i32_b32 s13, s7 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GCN-IR-NEXT: s_add_i32 s8, s10, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 +; GCN-IR-NEXT: s_add_i32 s9, s12, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v2, s9 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_vccz BB7_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB7_7 +; GCN-IR-NEXT: BB7_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[6:7], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB7_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB7_6 +; GCN-IR-NEXT: BB7_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[6:7], v0 +; GCN-IR-NEXT: s_add_u32 s6, s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s7, s3, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB7_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; 
GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s7 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s6, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s2, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s3, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB7_5 +; GCN-IR-NEXT: BB7_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB7_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr i48 %x, 24 %2 = lshr i48 %y, 24 %result = udiv i48 %1, %2 @@ -689,6 +1025,84 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s1, s7 +; GCN-IR-NEXT: s_add_i32 s0, s0, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s0 
+; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc5, v0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 +; GCN-IR-NEXT: v_addc_u32_e64 v3, s[2:3], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[2:3], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[2:3] +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_cbranch_vccz BB8_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB8_7 +; GCN-IR-NEXT: BB8_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB8_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB8_6 +; GCN-IR-NEXT: BB8_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v0 +; GCN-IR-NEXT: s_add_u32 s3, s6, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s8, s7, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB8_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s8 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: 
v_sub_i32_e64 v2, s[0:1], s3, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s6, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s7, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB8_5 +; GCN-IR-NEXT: BB8_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB8_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, s2 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %result = udiv i64 24, %x store i64 %result, i64 addrspace(1)* %out ret void @@ -798,6 +1212,88 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv_pow2_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 +; GCN-IR-NEXT: s_mov_b32 s10, 0x8000 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; GCN-IR-NEXT: s_mov_b32 s11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s10 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v2, 
0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB9_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[10:11], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB9_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: BB9_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[4:5], v9, v13, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 +; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v10 +; 
GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v11, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB9_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB9_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v6, v0 +; GCN-IR-NEXT: BB9_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 32768, %x ret i64 %result } @@ -809,6 +1305,81 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 15 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 15, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv_pow2_k_den_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 48, v2 +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB10_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 
0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB10_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v0, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: BB10_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 +; GCN-IR-NEXT: v_or_b32_e32 v7, v1, v7 +; GCN-IR-NEXT: v_or_b32_e32 v6, v0, v6 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s12, v8 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v8, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v9, v10, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v0, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v5 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB10_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB10_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[6:7], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 +; 
GCN-IR-NEXT: BB10_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %x, 32768 ret i64 %result } @@ -921,6 +1492,81 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv_k_den_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s2, s7 +; GCN-IR-NEXT: s_add_i32 s3, s0, 32 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 59, v0 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[2:3], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[2:3], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[2:3] +; GCN-IR-NEXT: s_cbranch_vccz BB11_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB11_7 +; GCN-IR-NEXT: BB11_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[6:7], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB11_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; 
GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB11_6 +; GCN-IR-NEXT: BB11_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[6:7], v0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB11_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], 23, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], 0, v9, s[0:1] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 24, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v6 +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v9, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB11_5 +; GCN-IR-NEXT: BB11_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB11_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %result = udiv i64 %x, 24 store i64 %result, i64 addrspace(1)* %out ret void @@ -1027,6 +1673,79 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-NEXT: v_cndmask_b32_e32 v1, v10, v8, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv_k_den_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 59, v2 +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB12_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB12_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v0, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: BB12_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 +; GCN-IR-NEXT: v_or_b32_e32 v7, v1, v7 +; GCN-IR-NEXT: v_or_b32_e32 
v6, v0, v6 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 23, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v0, s[4:5], 0, v9, s[4:5] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 24, v0 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v0 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v9, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v5 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB12_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB12_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[6:7], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 +; GCN-IR-NEXT: BB12_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %x, 24 ret i64 %result } @@ -1038,35 +1757,45 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s2, 0x41c00000 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_lshr_b32 s2, s3, 8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; 
GCN-NEXT: v_mul_hi_u32 v0, v0, 24 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_gt_u32_e64 s[0:1], 25, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, 24, v1 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] +; GCN-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_mul_f32_e32 v1, s2, v1 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v2, -v1, v0, s2 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s2, 0x41c00000 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, s2, v1 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s2 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = lshr i64 %x, 40 %result = udiv i64 24, %x.shr store i64 %result, i64 addrspace(1)* %out @@ -1080,37 +1809,43 @@ define amdgpu_kernel void @s_test_udiv24_k_den_i64(i64 
addrspace(1)* %out, i64 % ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_movk_i32 s2, 0x5b7f -; GCN-NEXT: s_movk_i32 s8, 0x5b7e -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 +; GCN-NEXT: s_mov_b32 s2, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_lshr_b32 s3, s3, 8 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s3, v1 -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s8, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] +; GCN-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_udiv24_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; 
GCN-IR-NEXT: s_mov_b32 s2, 0x46b6fe00 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = lshr i64 %x, 40 %result = udiv i64 %x.shr, 23423 store i64 %result, i64 addrspace(1)* %out @@ -1122,31 +1857,35 @@ define i64 @v_test_udiv24_k_num_i64(i64 %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GCN-NEXT: s_mov_b32 s4, 0x41c00000 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v2, -v1, v0, s4 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v2, v1 -; GCN-NEXT: v_add_i32_e64 v3, s[4:5], v1, v2 -; GCN-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, -1, v1 -; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 25, v2 -; GCN-NEXT: v_sub_i32_e64 v2, s[4:5], 24, v2 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], 
vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s4 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = lshr i64 %x, 40 %result = udiv i64 24, %x.shr ret i64 %result @@ -1157,32 +1896,35 @@ define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 -; GCN-NEXT: s_mov_b32 s6, 0x8001 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GCN-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v2, -v1, v0, s4 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v2, v1 -; GCN-NEXT: v_add_i32_e64 v3, s[4:5], v1, v2 -; GCN-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; GCN-NEXT: 
v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v1, 17, v1 -; GCN-NEXT: v_mul_u32_u24_e32 v2, v1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, -1, v1 -; GCN-NEXT: v_cmp_gt_u32_e64 s[4:5], s6, v2 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0x8000, v2 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 -; GCN-NEXT: s_and_b64 vcc, vcc, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[4:5] +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv24_pow2_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s4 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = lshr i64 %x, 40 %result = udiv i64 32768, %x.shr ret i64 %result @@ -1195,6 +1937,22 @@ define i64 @v_test_udiv24_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_lshrrev_b32_e32 v0, 23, v1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_udiv24_pow2_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38000000, v0 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s4, v0 +; 
GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = lshr i64 %x, 40 %result = udiv i64 %x.shr, 32768 ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 220f6ad57ddef..845d862eb0db5 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem_i64: @@ -122,6 +123,106 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 % ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s2 +; GCN-IR-NEXT: s_add_i32 s11, s0, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s3 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s7 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 +; GCN-IR-NEXT: s_add_i32 s8, s10, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s11 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 
+; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_vccz BB0_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB0_7 +; GCN-IR-NEXT: BB0_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[6:7], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB0_6 +; GCN-IR-NEXT: BB0_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[6:7], v0 +; GCN-IR-NEXT: s_add_u32 s8, s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s9, s3, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB0_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s9 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s8, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s2, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s3, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB0_5 +; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB0_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s10, -1 +; GCN-IR-NEXT: s_mov_b32 s8, s4 +; GCN-IR-NEXT: s_mov_b32 s9, s5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s2, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s3, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s7 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-IR-NEXT: s_endpgm %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -239,6 +340,95 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_urem_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 +; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v0 
+; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v1 +; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 32, v6 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN-IR-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v4, v5 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[6:7] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, s[6:7] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[6:7] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 63, v6 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[0:1], v8 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB1_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[0:1], v4 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: BB1_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, -1, v4 +; 
GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc +; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v6 +; GCN-IR-NEXT: v_or_b32_e32 v9, v13, v9 +; GCN-IR-NEXT: v_or_b32_e32 v8, v12, v8 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v10, v14 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[4:5], v11, v15, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 +; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v14, vcc, v14, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v15, vcc, v15, v13, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB1_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB1_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[8:9], 1 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: BB1_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mul_lo_u32 v5, v2, v5 +; GCN-IR-NEXT: v_mul_hi_u32 v6, v2, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v6, v5 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = urem i64 %x, %y ret i64 %result } @@ -247,40 +437,56 @@ define amdgpu_kernel void @s_test_urem31_i64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_urem31_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: 
s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_lshr_b32 s3, s7, 1 -; GCN-NEXT: s_lshr_b32 s4, s2, 1 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_lshr_b32 s4, s7, 1 +; GCN-NEXT: s_lshr_b32 s5, s8, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s4 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s4 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s3, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v1 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem31_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_lshr_b32 s4, s7, 1 +; GCN-IR-NEXT: s_lshr_b32 s5, s8, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s5 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 33 %2 = lshr i64 %y, 33 %result = urem i64 %1, %2 @@ -291,66 +497,87 @@ define amdgpu_kernel void @s_test_urem31_i64(i64 addrspace(1)* %out, i64 %x, i64 define amdgpu_kernel void @s_test_urem31_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem31_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_brev_b32 s0, -2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s4, s13, 1 -; GCN-NEXT: s_lshr_b32 s6, s15, 1 -; GCN-NEXT: s_lshr_b32 s12, s5, 1 -; GCN-NEXT: s_lshr_b32 s5, s7, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: 
v_cvt_f32_u32_e32 v2, s12 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, s12 -; GCN-NEXT: v_mul_lo_u32 v6, v2, s12 -; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v4 -; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v7, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v5 -; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v8, s[2:3] -; GCN-NEXT: v_mul_hi_u32 v3, v3, v0 -; GCN-NEXT: v_mul_hi_u32 v4, v4, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v3, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v3, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v2 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[2:3] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s6 -; GCN-NEXT: v_mul_hi_u32 v2, v2, s4 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v2, v2, s12 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, s6, v0 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s5, v3 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s6, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v3 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s12, v4 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], s4, v2 -; GCN-NEXT: v_cmp_le_u32_e64 s[6:7], s12, v4 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s12, v4 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GCN-NEXT: s_and_b64 vcc, s[6:7], s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v0, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v0, v6, v3, s[4:5] +; GCN-NEXT: s_lshr_b32 s1, s7, 1 +; GCN-NEXT: s_lshr_b32 s2, s5, 1 +; 
GCN-NEXT: s_lshr_b32 s3, s11, 1 +; GCN-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GCN-NEXT: v_mul_f32_e32 v5, v0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, v3, v6 +; GCN-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-NEXT: v_trunc_f32_e32 v6, v6 +; GCN-NEXT: v_mad_f32 v0, -v5, v2, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GCN-NEXT: v_mad_f32 v3, -v6, v4, v3 +; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v2, v2, s3 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 +; GCN-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-NEXT: v_and_b32_e32 v2, s0, v2 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem31_v2i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s14, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_brev_b32 s0, -2 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_lshr_b32 s1, s7, 1 +; GCN-IR-NEXT: s_lshr_b32 s2, s5, 1 +; GCN-IR-NEXT: s_lshr_b32 s3, s11, 1 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; 
GCN-IR-NEXT: v_mul_f32_e32 v5, v0, v5 +; GCN-IR-NEXT: v_mul_f32_e32 v6, v3, v6 +; GCN-IR-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-IR-NEXT: v_trunc_f32_e32 v6, v6 +; GCN-IR-NEXT: v_mad_f32 v0, -v5, v2, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GCN-IR-NEXT: v_mad_f32 v3, -v6, v4, v3 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 +; GCN-IR-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-IR-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, %result = urem <2 x i64> %1, %2 @@ -362,40 +589,56 @@ define amdgpu_kernel void @s_test_urem24_i64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-LABEL: s_test_urem24_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: s_lshr_b32 s3, s7, 8 -; GCN-NEXT: s_lshr_b32 s4, s2, 8 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_lshr_b32 s4, s7, 8 +; GCN-NEXT: s_lshr_b32 s5, s8, 8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s4 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s4 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 
-; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s3, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v1 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem24_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_lshr_b32 s4, s7, 8 +; GCN-IR-NEXT: s_lshr_b32 s5, s8, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 
vcc, |v0|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s5 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 40 %2 = lshr i64 %y, 40 %result = urem i64 %1, %2 @@ -406,55 +649,87 @@ define amdgpu_kernel void @s_test_urem24_i64(i64 addrspace(1)* %out, i64 %x, i64 define amdgpu_kernel void @s_test_urem23_64_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem23_64_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x11 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_brev_b32 s0, -2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s11, 9 -; GCN-NEXT: s_lshr_b32 s3, s9, 1 -; GCN-NEXT: s_lshr_b32 s8, s15, 9 -; GCN-NEXT: s_lshr_b32 s9, s13, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v3 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_mul_f32_e32 v4, v2, v4 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v0, s9 -; GCN-NEXT: v_mul_lo_u32 v6, v0, s9 -; GCN-NEXT: v_mad_f32 v2, -v4, v3, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v6 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v3| -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 -; 
GCN-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v2, v2, s8 -; GCN-NEXT: v_mul_hi_u32 v3, v3, v0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v3, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v3, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s9 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, s3, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s9, v3 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v3 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s9, v3 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1] -; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GCN-NEXT: s_lshr_b32 s1, s7, 9 +; GCN-NEXT: s_lshr_b32 s2, s5, 1 +; GCN-NEXT: s_lshr_b32 s3, s11, 9 +; GCN-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GCN-NEXT: v_mul_f32_e32 v5, v0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, v3, v6 +; GCN-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-NEXT: v_trunc_f32_e32 v6, v6 +; GCN-NEXT: v_mad_f32 v0, -v5, v2, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GCN-NEXT: v_mad_f32 v3, -v6, v4, v3 +; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v2, v2, s3 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 +; GCN-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-NEXT: v_and_b32_e32 v2, s0, v2 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx4 
v[0:3], off, s[12:15], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem23_64_v2i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s14, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_brev_b32 s0, -2 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_lshr_b32 s1, s7, 9 +; GCN-IR-NEXT: s_lshr_b32 s2, s5, 1 +; GCN-IR-NEXT: s_lshr_b32 s3, s11, 9 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GCN-IR-NEXT: v_mul_f32_e32 v5, v0, v5 +; GCN-IR-NEXT: v_mul_f32_e32 v6, v3, v6 +; GCN-IR-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-IR-NEXT: v_trunc_f32_e32 v6, v6 +; GCN-IR-NEXT: v_mad_f32 v0, -v5, v2, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GCN-IR-NEXT: v_mad_f32 v3, -v6, v4, v3 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 +; GCN-IR-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-IR-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, %result = urem <2 x i64> %1, %2 @@ -573,6 +848,93 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] ; GCN-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s1, s7 +; GCN-IR-NEXT: s_add_i32 s0, s0, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s0 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc5, v0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 +; GCN-IR-NEXT: v_addc_u32_e64 v3, s[2:3], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[2:3], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[2:3] +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_cbranch_vccz BB6_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB6_7 +; GCN-IR-NEXT: BB6_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB6_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB6_6 +; GCN-IR-NEXT: BB6_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v0 +; GCN-IR-NEXT: s_add_u32 s3, s6, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_addc_u32 s8, s7, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB6_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 
v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v10, s8 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], s3, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], v10, v9, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v7, s6, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, s7, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[0:1], v9, v6, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB6_5 +; GCN-IR-NEXT: BB6_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB6_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s6, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s7, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s6, v0 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %result = urem i64 24, %x store i64 %result, i64 addrspace(1)* %out ret void @@ -684,6 +1046,90 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; 
GCN-IR-LABEL: s_test_urem_k_den_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s2, s7 +; GCN-IR-NEXT: s_add_i32 s3, s0, 32 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 59, v0 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[2:3], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[2:3], s[0:1], vcc +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[2:3] +; GCN-IR-NEXT: s_cbranch_vccz BB7_2 +; GCN-IR-NEXT: ; %bb.1: +; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GCN-IR-NEXT: s_branch BB7_7 +; GCN-IR-NEXT: BB7_2: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[6:7], v2 +; GCN-IR-NEXT: s_cbranch_vccz BB7_4 +; GCN-IR-NEXT: ; %bb.3: +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_branch BB7_6 +; GCN-IR-NEXT: BB7_4: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[6:7], v0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: BB7_5: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5 +; GCN-IR-NEXT: v_lshl_b64 
v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[0:1], 23, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v2, s[0:1], 0, v9, s[0:1] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 24, v6 +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[0:1], v8, v6 +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc +; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v9, s[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: s_cbranch_vccz BB7_5 +; GCN-IR-NEXT: BB7_6: ; %udiv-loop-exit +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-IR-NEXT: BB7_7: ; %udiv-end +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, 24 +; GCN-IR-NEXT: v_mul_hi_u32 v2, v0, 24 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, 24 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s7 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: s_endpgm %result = urem i64 %x, 24 store i64 %result, i64 addrspace(1)* %out ret void @@ -793,6 +1239,94 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_urem_pow2_k_num_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: 
v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 +; GCN-IR-NEXT: s_mov_b32 s10, 0x8000 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; GCN-IR-NEXT: s_mov_b32 s11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s10 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB8_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[10:11], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB8_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: BB8_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: 
v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[4:5], v9, v13, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 +; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v11, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB8_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB8_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[4:5], 1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v6, v2 +; GCN-IR-NEXT: BB8_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v3 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 +; GCN-IR-NEXT: v_add_i32_e64 v1, s[4:5], v2, v1 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = urem i64 32768, %x ret i64 %result } @@ -804,6 +1338,82 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_urem_pow2_k_den_i64: +; GCN-IR: ; %bb.0: ; %_udiv-special-cases +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 
vcc, 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 48, v2 +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v1, 0, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[6:7] +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB9_6 +; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_execz BB9_5 +; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: BB9_3: ; %udiv-do-while +; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v3, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_or_b32_e32 v7, v9, v7 +; 
GCN-IR-NEXT: v_or_b32_e32 v6, v8, v6 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v10 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v11, vcc +; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8 +; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8 +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v12, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: s_cbranch_execnz BB9_3 +; GCN-IR-NEXT: ; %bb.4: ; %Flow +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: BB9_5: ; %Flow1 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[6:7], 1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN-IR-NEXT: BB9_6: ; %Flow2 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 15 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = urem i64 %x, 32768 ret i64 %result } @@ -815,35 +1425,49 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s2, 0x41c00000 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_lshr_b32 s8, s3, 8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s8 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s8 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; 
GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, 24, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s8, v1 -; GCN-NEXT: v_cmp_gt_u32_e64 s[0:1], 25, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v1 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] +; GCN-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_mul_f32_e32 v1, s2, v1 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v2, -v1, v0, s2 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b32 s2, 0x41c00000 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, s2, v1 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s2 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 
+; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = lshr i64 %x, 40 %result = urem i64 24, %x.shr store i64 %result, i64 addrspace(1)* %out @@ -856,38 +1480,50 @@ define amdgpu_kernel void @s_test_urem24_k_den_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_movk_i32 s2, 0x5b7f -; GCN-NEXT: s_movk_i32 s8, 0x5b7e -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_lshr_b32 s3, s3, 8 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_hi_u32 v1, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 +; GCN-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v0, -v1, s8, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s8 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s3, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s2, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v0 -; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s8, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0xffffa481, v1 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: 
v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm +; +; GCN-IR-LABEL: s_test_urem24_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_mov_b32 s8, 0x46b6fe00 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_movk_i32 s2, 0x5b7f +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_lshr_b32 s0, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s8, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s8 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_endpgm %x.shr = lshr i64 %x, 40 %result = urem i64 %x.shr, 23423 store i64 %result, i64 addrspace(1)* %out @@ -899,31 +1535,39 @@ define i64 @v_test_urem24_k_num_i64(i64 %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v2, v1 -; GCN-NEXT: v_add_i32_e64 v3, s[4:5], v1, v2 -; GCN-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN-NEXT: v_mul_hi_u32 v1, v1, 
24 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, 24, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v2, v0 -; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 25, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 -; GCN-NEXT: v_sub_i32_e64 v0, s[6:7], v2, v0 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_urem24_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v3, -v2, v1, s4 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = lshr i64 %x, 40 %result = urem i64 24, %x.shr ret i64 %result @@ -934,32 +1578,39 @@ define i64 @v_test_urem24_pow2_k_num_i64(i64 %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 -; GCN-NEXT: s_mov_b32 s6, 0x8001 +; GCN-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; 
GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v2, v1 -; GCN-NEXT: v_add_i32_e64 v3, s[4:5], v1, v2 -; GCN-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v1, 17, v1 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0x8000, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v2, v0 -; GCN-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 -; GCN-NEXT: v_sub_i32_e64 v0, s[6:7], v2, v0 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_urem24_pow2_k_num_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v3, -v2, v1, s4 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GCN-IR-NEXT: 
v_mul_lo_u32 v0, v1, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = lshr i64 %x, 40 %result = urem i64 32768, %x.shr ret i64 %result @@ -972,6 +1623,24 @@ define i64 @v_test_urem24_pow2_k_den_i64(i64 %x) { ; GCN-NEXT: v_bfe_u32 v0, v1, 8, 15 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IR-LABEL: v_test_urem24_pow2_k_den_i64: +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x38000000, v1 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v1, -v2, s4, v1 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, s4 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_lshlrev_b32_e32 v1, 15, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_setpc_b64 s[30:31] %x.shr = lshr i64 %x, 40 %result = urem i64 %x.shr, 32768 ret i64 %result diff --git a/llvm/test/CodeGen/Hexagon/swp-sigma.ll b/llvm/test/CodeGen/Hexagon/swp-sigma.ll index 3ab88b8d84631..1651742820998 100644 --- a/llvm/test/CodeGen/Hexagon/swp-sigma.ll +++ b/llvm/test/CodeGen/Hexagon/swp-sigma.ll @@ -2,28 +2,11 @@ ; We do not pipeline sigma yet, but the non-pipelined version ; with good scheduling is pretty fast. The compiler generates -; 19 packets, and the assembly version is 16. +; 18 packets, and the assembly version is 16. 
; CHECK: loop0(.LBB0_[[LOOP:.]], ; CHECK: .LBB0_[[LOOP]]: -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } -; CHECK: } +; CHECK-COUNT-17: } ; CHECK: }{{[ \t]*}}:endloop @g0 = external constant [10 x i16], align 128 diff --git a/llvm/test/CodeGen/Hexagon/vect-regpairs.ll b/llvm/test/CodeGen/Hexagon/vect-regpairs.ll new file mode 100644 index 0000000000000..4d505fc2f4eef --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vect-regpairs.ll @@ -0,0 +1,134 @@ +;RUN: llc -march=hexagon -mcpu=hexagonv66 -mhvx -filetype=obj < %s -o - | llvm-objdump -mv66 -mhvx -d - | FileCheck --check-prefix=CHECK-V66 %s +;RUN: llc -march=hexagon -mcpu=hexagonv67 -mhvx -filetype=obj < %s -o - | llvm-objdump -mv67 -mhvx -d - | FileCheck --check-prefix=CHECK-V67 %s + +; Should not attempt to use v: 'reverse' vector regpairs +; on old or new arches (should not crash). 
+ +; CHECK-V66: vcombine +; CHECK-V67: vcombine +declare <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.hexagon.V6.vd0() +declare <32 x i32> @llvm.hexagon.V6.vmpybus(<16 x i32>, i32) +declare <32 x i32> @llvm.hexagon.V6.vmpabus.acc(<32 x i32>, <32 x i32>, i32) +declare <16 x i32> @llvm.hexagon.V6.hi(<32 x i32>) +declare <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32>, <16 x i32>, i32 ) +declare <16 x i32> @llvm.hexagon.V6.lo(<32 x i32>) +declare <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32>, <16 x i32>, i32 ) +declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.hexagon.V6.vmpyihb.acc(<16 x i32>, <16 x i32>, i32) +declare <16 x i32> @llvm.hexagon.V6.vasrhubrndsat(<16 x i32>, <16 x i32>, i32) + +declare <32 x i32> @llvm.hexagon.V6.vaddubh(<16 x i32>, <16 x i32>) +declare <32 x i32> @llvm.hexagon.V6.vmpybus.acc(<32 x i32>, <16 x i32>, i32) +declare <16 x i32> @llvm.hexagon.V6.vmpyiwb.acc(<16 x i32>, <16 x i32>, i32) +declare <16 x i32> @llvm.hexagon.V6.vshuffob(<16 x i32>, <16 x i32>) + + +define void @Gaussian7x7u8PerRow(i8* %src, i32 %stride, i32 %width, i8* %dst) #0 { +entry: + %mul = mul i32 %stride, 3 + %idx.neg = sub i32 0, %mul + %add.ptr = getelementptr i8, i8* %src, i32 %idx.neg + bitcast i8* %add.ptr to <16 x i32>* + %mul1 = shl i32 %stride, 1 + %idx.neg2 = sub i32 0, %mul1 + %add.ptr3 = getelementptr i8, i8* %src, i32 %idx.neg2 + bitcast i8* %add.ptr3 to <16 x i32>* + %idx.neg5 = sub i32 0, %stride + %add.ptr6 = getelementptr i8, i8* %src, i32 %idx.neg5 + bitcast i8* %add.ptr6 to <16 x i32>* + bitcast i8* %src to <16 x i32>* + %add.ptr10 = getelementptr i8, i8* %src, i32 %stride + bitcast i8* %add.ptr10 to <16 x i32>* + %add.ptr12 = getelementptr i8, i8* %src, i32 %mul1 + bitcast i8* %add.ptr12 to <16 x i32>* + %add.ptr14 = getelementptr i8, i8* %src, i32 %mul + bitcast i8* %add.ptr14 to <16 x i32>* + bitcast i8* %dst to <16 x i32>* + load <16 x i32>, <16 x i32>* %0load <16 x 
i32>, <16 x i32>* %1load <16 x i32>, <16 x i32>* %2load <16 x i32>, <16 x i32>* %3load <16 x i32>, <16 x i32>* %4load <16 x i32>, <16 x i32>* %5load <16 x i32>, <16 x i32>* %6call <16 x i32> @llvm.hexagon.V6.vd0() + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %15, <16 x i32> %15) + call <32 x i32> @llvm.hexagon.V6.vaddubh(<16 x i32> %14, <16 x i32> %8) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %13, <16 x i32> %9) + call <32 x i32> @llvm.hexagon.V6.vmpabus.acc(<32 x i32> %17, <32 x i32> %18, i32 101058054) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %12, <16 x i32> %10) + call <32 x i32> @llvm.hexagon.V6.vmpabus.acc(<32 x i32> %19, <32 x i32> %20, i32 252645135) + call <32 x i32> @llvm.hexagon.V6.vmpybus.acc(<32 x i32> %21, <16 x i32> %11, i32 336860180) + %cmp155 = icmp sgt i32 %width, 64 + br i1 %cmp155, label %for.body.preheader, label %for.end +for.body.preheader: %incdec.ptr20 = getelementptr i8, i8* %add.ptr14%23 = bitcast i8* %incdec.ptr20 to <16 x i32>* + %incdec.ptr19 = getelementptr i8, i8* %add.ptr12%24 = bitcast i8* %incdec.ptr19 to <16 x i32>* + %incdec.ptr18 = getelementptr i8, i8* %add.ptr10%25 = bitcast i8* %incdec.ptr18 to <16 x i32>* + %incdec.ptr17 = getelementptr i8, i8* %src%26 = bitcast i8* %incdec.ptr17 to <16 x i32>* + %incdec.ptr16 = getelementptr i8, i8* %add.ptr6%27 = bitcast i8* %incdec.ptr16 to <16 x i32>* + %incdec.ptr15 = getelementptr i8, i8* %add.ptr3%28 = bitcast i8* %incdec.ptr15 to <16 x i32>* + %incdec.ptr = getelementptr i8, i8* %add.ptr%29 = bitcast i8* %incdec.ptr to <16 x i32>* + br label %for.body +for.body: %optr.0166 = phi <16 x i32>* [ %incdec.ptr28, %for.body ], [ %7, %for.body.preheader ] + %iptr6.0165 = phi <16 x i32>* [ %incdec.ptr27, %for.body ], [ %23, %for.body.preheader ] + %iptr5.0164 = phi <16 x i32>* [ %incdec.ptr26, %for.body ], [ %24, %for.body.preheader ] + %iptr4.0163 = phi <16 x i32>* [ %incdec.ptr25, %for.body ], [ %25, %for.body.preheader ] + %iptr3.0162 = phi <16 x i32>* 
[ %incdec.ptr24, %for.body ], [ %26, %for.body.preheader ] + %iptr2.0161 = phi <16 x i32>* [ %incdec.ptr23, %for.body ], [ %27, %for.body.preheader ] + %iptr1.0160 = phi <16 x i32>* [ %incdec.ptr22, %for.body ], [ %28, %for.body.preheader ] + %iptr0.0159 = phi <16 x i32>* [ %incdec.ptr21, %for.body ], [ %29, %for.body.preheader ] + %dXV1.0158 = phi <32 x i32> [ %49, %for.body ], [ %22, %for.body.preheader ] + %dXV0.0157 = phi <32 x i32> [ %dXV1.0158, %for.body ], [ %16, %for.body.preheader ] + %i.0156 = phi i32 [ %sub, %for.body ], [ %width, %for.body.preheader ] + %incdec.ptr21 = getelementptr <16 x i32>, <16 x i32>* %iptr0.0159%30 = load <16 x i32>, <16 x i32>* %iptr0.0159%incdec.ptr22 = getelementptr <16 x i32>, <16 x i32>* %iptr1.0160%31 = load <16 x i32>, <16 x i32>* %iptr1.0160%incdec.ptr23 = getelementptr <16 x i32>, <16 x i32>* %iptr2.0161%32 = load <16 x i32>, <16 x i32>* %iptr2.0161%incdec.ptr24 = getelementptr <16 x i32>, <16 x i32>* %iptr3.0162%33 = load <16 x i32>, <16 x i32>* %iptr3.0162%incdec.ptr25 = getelementptr <16 x i32>, <16 x i32>* %iptr4.0163%34 = load <16 x i32>, <16 x i32>* %iptr4.0163%incdec.ptr26 = getelementptr <16 x i32>, <16 x i32>* %iptr5.0164%35 = load <16 x i32>, <16 x i32>* %iptr5.0164%incdec.ptr27 = getelementptr <16 x i32>, <16 x i32>* %iptr6.0165%36 = load <16 x i32>, <16 x i32>* %iptr6.0165, !tbaa !8 + call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %dXV1.0158) + call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %dXV0.0157) + call <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32> %37, <16 x i32> %38, i32 2) + call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %dXV1.0158) + call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %dXV0.0157) + call <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32> %40, <16 x i32> %41, i32 2) + call <16 x i32> @llvm.hexagon.V6.vlalignbi(<16 x i32> %37, <16 x i32> %38, i32 4) + call <32 x i32> @llvm.hexagon.V6.vaddubh(<16 x i32> %36, <16 x i32> %30) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %35, <16 x 
i32> %31) + call <32 x i32> @llvm.hexagon.V6.vmpabus.acc(<32 x i32> %44, <32 x i32> %45, i32 101058054) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %34, <16 x i32> %32) + call <32 x i32> @llvm.hexagon.V6.vmpabus.acc(<32 x i32> %46, <32 x i32> %47, i32 252645135) + call <32 x i32> @llvm.hexagon.V6.vmpybus.acc(<32 x i32> %48, <16 x i32> %33, i32 336860180) + call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %49) + call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> %50, <16 x i32> %40, i32 2) + call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %49) + call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> %52, <16 x i32> %37, i32 2) + call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> %50, <16 x i32> %40, i32 4) + call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %37, <16 x i32> %39) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %55, <16 x i32> %40) + call <32 x i32> @llvm.hexagon.V6.vmpahb(<32 x i32> %56, i32 252972820) + call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %51, <16 x i32> %40) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %58, <16 x i32> %37) + call <32 x i32> @llvm.hexagon.V6.vmpahb(<32 x i32> %59, i32 252972820) + call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %53, <16 x i32> %43) + call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %51, <16 x i32> %42) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %61, <16 x i32> %62) + call <32 x i32> @llvm.hexagon.V6.vmpahb.acc(<32 x i32> %57, <32 x i32> %63, i32 17170694) + call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %54, <16 x i32> %42) + call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %53, <16 x i32> %39) + call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %65, <16 x i32> %66) + call <32 x i32> @llvm.hexagon.V6.vmpahb.acc(<32 x i32> %60, <32 x i32> %67, i32 17170694) + call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %64) + call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %64) + call <16 x i32> @llvm.hexagon.V6.vasrwh(<16 x i32> %69, <16 x i32> %70, i32 12) + call <16 x 
i32> @llvm.hexagon.V6.hi(<32 x i32> %68) + call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %68) + call <16 x i32> @llvm.hexagon.V6.vasrwh(<16 x i32> %72, <16 x i32> %73, i32 12) + call <16 x i32> @llvm.hexagon.V6.vshuffeb(<16 x i32> %74, <16 x i32> %71) + %incdec.ptr28 = getelementptr <16 x i32>, <16 x i32>* %1 + store <16 x i32> %75, <16 x i32>* %optr.0166%sub = add i32 %i.0156, -64 + %cmp = icmp sgt i32 %sub, 64 + br i1 %cmp, label %for.body, label %for.end +for.end: ret void +} +declare <32 x i32> @llvm.hexagon.V6.vmpahb(<32 x i32>, i32) +declare <32 x i32> @llvm.hexagon.V6.vmpahb.acc(<32 x i32>, <32 x i32>, i32) +declare <16 x i32> @llvm.hexagon.V6.vasrwh(<16 x i32>, <16 x i32>, i32) +declare <16 x i32> @llvm.hexagon.V6.vshuffeb(<16 x i32>, <16 x i32>) + +attributes #0 = { "correctly-rounded-divide-sqrt-fp-math""target-cpu"="hexagonv65" "target-features"="+hvx-length64b,+hvxv65,+v65,-long-calls" "unsafe-fp-math"} +!8 = !{!9, !9, i64 0} +!9 = !{!"omnipotent char", !10} +!10 = !{} +!14 = !{} +!19 = !{} +!24 = !{} diff --git a/llvm/test/CodeGen/X86/fast-isel-float-half-convertion.ll b/llvm/test/CodeGen/X86/fast-isel-float-half-convertion.ll index acb85fd171f54..43a26c123e78f 100644 --- a/llvm/test/CodeGen/X86/fast-isel-float-half-convertion.ll +++ b/llvm/test/CodeGen/X86/fast-isel-float-half-convertion.ll @@ -1,4 +1,5 @@ ; RUN: llc -fast-isel -fast-isel-abort=1 -asm-verbose=false -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s +; RUN: llc -fast-isel -fast-isel-abort=1 -asm-verbose=false -mtriple=x86_64-unknown-unknown -mattr=+avx512vl < %s | FileCheck %s ; Verify that fast-isel correctly expands float-half conversions. 
@@ -14,7 +15,7 @@ entry: define float @test_fp16_to_fp32(i32 %a) { ; CHECK-LABEL: test_fp16_to_fp32: -; CHECK: movswl %di, %eax +; CHECK: movzwl %di, %eax ; CHECK-NEXT: vmovd %eax, %xmm0 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll index 65befee085c03..0b6fb97ef913d 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll @@ -1,57 +1,257 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=nehalem | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-EST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=sandybridge | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=broadwell | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=skylake | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-ACC - -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=+fast-scalar-fsqrt,-fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=-fast-scalar-fsqrt,+fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-ACC - -declare float @llvm.sqrt.f32(float) #0 -declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #0 -declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0 - -define float @foo_x1(float %f) #0 { -; SCALAR-EST-LABEL: foo_x1: -; SCALAR-EST: # %bb.0: -; SCALAR-EST-NEXT: rsqrtss %xmm0 -; SCALAR-EST: retq -; -; SCALAR-ACC-LABEL: foo_x1: -; SCALAR-ACC: # %bb.0: -; SCALAR-ACC-NEXT: {{^ *v?sqrtss %xmm0}} -; SCALAR-ACC-NEXT: retq - %call = tail call float @llvm.sqrt.f32(float %f) #1 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mcpu=nehalem | FileCheck %s 
--check-prefixes=NHM +; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s --check-prefixes=FAST-SCALAR,SNB +; RUN: llc < %s -mtriple=x86_64-- -mcpu=broadwell | FileCheck %s --check-prefixes=FAST-SCALAR,BDW +; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake | FileCheck %s --check-prefixes=FAST-SCALAR,SKL + +define float @f32_no_daz(float %f) #0 { +; NHM-LABEL: f32_no_daz: +; NHM: # %bb.0: +; NHM-NEXT: rsqrtss %xmm0, %xmm1 +; NHM-NEXT: movaps %xmm0, %xmm2 +; NHM-NEXT: mulss %xmm1, %xmm2 +; NHM-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; NHM-NEXT: mulss %xmm2, %xmm3 +; NHM-NEXT: mulss %xmm1, %xmm2 +; NHM-NEXT: addss {{.*}}(%rip), %xmm2 +; NHM-NEXT: andps {{.*}}(%rip), %xmm0 +; NHM-NEXT: mulss %xmm3, %xmm2 +; NHM-NEXT: cmpltss {{.*}}(%rip), %xmm0 +; NHM-NEXT: andnps %xmm2, %xmm0 +; NHM-NEXT: retq +; +; FAST-SCALAR-LABEL: f32_no_daz: +; FAST-SCALAR: # %bb.0: +; FAST-SCALAR-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; FAST-SCALAR-NEXT: retq + %call = tail call fast float @llvm.sqrt.f32(float %f) #2 + ret float %call +} + +define <4 x float> @v4f32_no_daz(<4 x float> %f) #0 { +; NHM-LABEL: v4f32_no_daz: +; NHM: # %bb.0: +; NHM-NEXT: rsqrtps %xmm0, %xmm2 +; NHM-NEXT: movaps %xmm0, %xmm1 +; NHM-NEXT: mulps %xmm2, %xmm1 +; NHM-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; NHM-NEXT: mulps %xmm1, %xmm3 +; NHM-NEXT: mulps %xmm2, %xmm1 +; NHM-NEXT: addps {{.*}}(%rip), %xmm1 +; NHM-NEXT: andps {{.*}}(%rip), %xmm0 +; NHM-NEXT: mulps %xmm3, %xmm1 +; NHM-NEXT: movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; NHM-NEXT: cmpleps %xmm0, %xmm2 +; NHM-NEXT: andps %xmm2, %xmm1 +; NHM-NEXT: movaps %xmm1, %xmm0 +; NHM-NEXT: retq +; +; SNB-LABEL: v4f32_no_daz: +; SNB: # %bb.0: +; SNB-NEXT: vrsqrtps %xmm0, %xmm1 +; SNB-NEXT: vmulps %xmm1, %xmm0, %xmm2 +; SNB-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm3 +; SNB-NEXT: vmulps %xmm1, %xmm2, %xmm1 +; SNB-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1 +; SNB-NEXT: vandps {{.*}}(%rip), 
%xmm0, %xmm0 +; SNB-NEXT: vmulps %xmm1, %xmm3, %xmm1 +; SNB-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; SNB-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 +; SNB-NEXT: vandps %xmm1, %xmm0, %xmm0 +; SNB-NEXT: retq +; +; BDW-LABEL: v4f32_no_daz: +; BDW: # %bb.0: +; BDW-NEXT: vrsqrtps %xmm0, %xmm1 +; BDW-NEXT: vmulps %xmm1, %xmm0, %xmm2 +; BDW-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; BDW-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3 +; BDW-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; BDW-NEXT: vmulps %xmm1, %xmm2, %xmm1 +; BDW-NEXT: vmulps %xmm3, %xmm1, %xmm1 +; BDW-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] +; BDW-NEXT: vandps %xmm2, %xmm0, %xmm0 +; BDW-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; BDW-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 +; BDW-NEXT: vandps %xmm1, %xmm0, %xmm0 +; BDW-NEXT: retq +; +; SKL-LABEL: v4f32_no_daz: +; SKL: # %bb.0: +; SKL-NEXT: vsqrtps %xmm0, %xmm0 +; SKL-NEXT: retq + %call = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #2 + ret <4 x float> %call +} + +define <8 x float> @v8f32_no_daz(<8 x float> %f) #0 { +; NHM-LABEL: v8f32_no_daz: +; NHM: # %bb.0: +; NHM-NEXT: sqrtps %xmm0, %xmm0 +; NHM-NEXT: sqrtps %xmm1, %xmm1 +; NHM-NEXT: retq +; +; SNB-LABEL: v8f32_no_daz: +; SNB: # %bb.0: +; SNB-NEXT: vrsqrtps %ymm0, %ymm1 +; SNB-NEXT: vmulps %ymm1, %ymm0, %ymm2 +; SNB-NEXT: vmulps {{.*}}(%rip), %ymm2, %ymm3 +; SNB-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; SNB-NEXT: vaddps {{.*}}(%rip), %ymm1, %ymm1 +; SNB-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1 +; SNB-NEXT: vmovaps {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; SNB-NEXT: vcmpleps %ymm0, %ymm2, %ymm0 +; SNB-NEXT: vandps %ymm1, %ymm0, %ymm0 +; SNB-NEXT: retq +; +; BDW-LABEL: v8f32_no_daz: +; 
BDW: # %bb.0: +; BDW-NEXT: vrsqrtps %ymm0, %ymm1 +; BDW-NEXT: vmulps %ymm1, %ymm0, %ymm2 +; BDW-NEXT: vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; BDW-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3 +; BDW-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; BDW-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; BDW-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; BDW-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; BDW-NEXT: vandps %ymm2, %ymm0, %ymm0 +; BDW-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; BDW-NEXT: vcmpleps %ymm0, %ymm2, %ymm0 +; BDW-NEXT: vandps %ymm1, %ymm0, %ymm0 +; BDW-NEXT: retq +; +; SKL-LABEL: v8f32_no_daz: +; SKL: # %bb.0: +; SKL-NEXT: vsqrtps %ymm0, %ymm0 +; SKL-NEXT: retq + %call = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #2 + ret <8 x float> %call +} + +; Repeat all tests with denorms-as-zero enabled. 
+ +define float @f32_daz(float %f) #1 { +; NHM-LABEL: f32_daz: +; NHM: # %bb.0: +; NHM-NEXT: rsqrtss %xmm0, %xmm1 +; NHM-NEXT: movaps %xmm0, %xmm2 +; NHM-NEXT: mulss %xmm1, %xmm2 +; NHM-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; NHM-NEXT: mulss %xmm2, %xmm3 +; NHM-NEXT: mulss %xmm1, %xmm2 +; NHM-NEXT: addss {{.*}}(%rip), %xmm2 +; NHM-NEXT: mulss %xmm3, %xmm2 +; NHM-NEXT: xorps %xmm1, %xmm1 +; NHM-NEXT: cmpeqss %xmm1, %xmm0 +; NHM-NEXT: andnps %xmm2, %xmm0 +; NHM-NEXT: retq +; +; FAST-SCALAR-LABEL: f32_daz: +; FAST-SCALAR: # %bb.0: +; FAST-SCALAR-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; FAST-SCALAR-NEXT: retq + %call = tail call fast float @llvm.sqrt.f32(float %f) #2 ret float %call } -define <4 x float> @foo_x4(<4 x float> %f) #0 { -; VECTOR-EST-LABEL: foo_x4: -; VECTOR-EST: # %bb.0: -; VECTOR-EST-NEXT: rsqrtps %xmm0 -; VECTOR-EST: retq -; -; VECTOR-ACC-LABEL: foo_x4: -; VECTOR-ACC: # %bb.0: -; VECTOR-ACC-NEXT: {{^ *v?sqrtps %xmm0}} -; VECTOR-ACC-NEXT: retq - %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #1 +define <4 x float> @v4f32_daz(<4 x float> %f) #1 { +; NHM-LABEL: v4f32_daz: +; NHM: # %bb.0: +; NHM-NEXT: rsqrtps %xmm0, %xmm1 +; NHM-NEXT: movaps %xmm0, %xmm2 +; NHM-NEXT: mulps %xmm1, %xmm2 +; NHM-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; NHM-NEXT: mulps %xmm2, %xmm3 +; NHM-NEXT: mulps %xmm1, %xmm2 +; NHM-NEXT: addps {{.*}}(%rip), %xmm2 +; NHM-NEXT: mulps %xmm3, %xmm2 +; NHM-NEXT: xorps %xmm1, %xmm1 +; NHM-NEXT: cmpneqps %xmm1, %xmm0 +; NHM-NEXT: andps %xmm2, %xmm0 +; NHM-NEXT: retq +; +; SNB-LABEL: v4f32_daz: +; SNB: # %bb.0: +; SNB-NEXT: vrsqrtps %xmm0, %xmm1 +; SNB-NEXT: vmulps %xmm1, %xmm0, %xmm2 +; SNB-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm3 +; SNB-NEXT: vmulps %xmm1, %xmm2, %xmm1 +; SNB-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1 +; SNB-NEXT: vmulps %xmm1, %xmm3, %xmm1 +; SNB-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; SNB-NEXT: vcmpneqps %xmm2, %xmm0, %xmm0 +; SNB-NEXT: vandps %xmm1, %xmm0, %xmm0 +; SNB-NEXT: retq +; 
+; BDW-LABEL: v4f32_daz: +; BDW: # %bb.0: +; BDW-NEXT: vrsqrtps %xmm0, %xmm1 +; BDW-NEXT: vmulps %xmm1, %xmm0, %xmm2 +; BDW-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; BDW-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3 +; BDW-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; BDW-NEXT: vmulps %xmm1, %xmm2, %xmm1 +; BDW-NEXT: vmulps %xmm3, %xmm1, %xmm1 +; BDW-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; BDW-NEXT: vcmpneqps %xmm2, %xmm0, %xmm0 +; BDW-NEXT: vandps %xmm1, %xmm0, %xmm0 +; BDW-NEXT: retq +; +; SKL-LABEL: v4f32_daz: +; SKL: # %bb.0: +; SKL-NEXT: vsqrtps %xmm0, %xmm0 +; SKL-NEXT: retq + %call = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #2 ret <4 x float> %call } -define <8 x float> @foo_x8(<8 x float> %f) #0 { -; VECTOR-EST-LABEL: foo_x8: -; VECTOR-EST: # %bb.0: -; VECTOR-EST-NEXT: rsqrtps -; VECTOR-EST: retq -; -; VECTOR-ACC-LABEL: foo_x8: -; VECTOR-ACC: # %bb.0: -; VECTOR-ACC-NEXT: {{^ *v?sqrtps %[xy]mm0}} -; VECTOR-ACC-NOT: rsqrt -; VECTOR-ACC: retq - %call = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #1 +define <8 x float> @v8f32_daz(<8 x float> %f) #1 { +; NHM-LABEL: v8f32_daz: +; NHM: # %bb.0: +; NHM-NEXT: sqrtps %xmm0, %xmm0 +; NHM-NEXT: sqrtps %xmm1, %xmm1 +; NHM-NEXT: retq +; +; SNB-LABEL: v8f32_daz: +; SNB: # %bb.0: +; SNB-NEXT: vrsqrtps %ymm0, %ymm1 +; SNB-NEXT: vmulps %ymm1, %ymm0, %ymm2 +; SNB-NEXT: vmulps {{.*}}(%rip), %ymm2, %ymm3 +; SNB-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; SNB-NEXT: vaddps {{.*}}(%rip), %ymm1, %ymm1 +; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1 +; SNB-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; SNB-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; SNB-NEXT: vandps %ymm1, %ymm0, %ymm0 +; SNB-NEXT: retq +; +; BDW-LABEL: v8f32_daz: +; BDW: # %bb.0: +; BDW-NEXT: vrsqrtps %ymm0, %ymm1 +; BDW-NEXT: vmulps %ymm1, %ymm0, %ymm2 +; BDW-NEXT: vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; BDW-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm2 * 
ymm1) + ymm3 +; BDW-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; BDW-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; BDW-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; BDW-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; BDW-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; BDW-NEXT: vandps %ymm1, %ymm0, %ymm0 +; BDW-NEXT: retq +; +; SKL-LABEL: v8f32_daz: +; SKL: # %bb.0: +; SKL-NEXT: vsqrtps %ymm0, %ymm0 +; SKL-NEXT: retq + %call = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #2 ret <8 x float> %call } -attributes #0 = { "unsafe-fp-math"="true" } -attributes #1 = { nounwind readnone } +declare float @llvm.sqrt.f32(float) #2 +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #2 +declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #2 + +attributes #0 = { "denormal-fp-math"="ieee,ieee" } +attributes #1 = { "denormal-fp-math"="ieee,preserve-sign" } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/MC/Hexagon/hvx-swapped-regpairs-alias-neg.s b/llvm/test/MC/Hexagon/hvx-swapped-regpairs-alias-neg.s new file mode 100644 index 0000000000000..1988f90dc56e3 --- /dev/null +++ b/llvm/test/MC/Hexagon/hvx-swapped-regpairs-alias-neg.s @@ -0,0 +1,15 @@ +# RUN: not llvm-mc -arch=hexagon -mcpu=hexagonv67 -mhvx -filetype=asm %s 2>%t; FileCheck --implicit-check-not="error:" %s <%t + +{ + v1:0 = #0 + v0:1 = #0 +} +# CHECK: error: register `V1' modified more than once + +## Unused .tmp: +{ + v1.tmp = vmem(r0 + #3) + v0:1 = vaddw(v17:16, v17:16) +} + +# CHECK: warning: register `V1' used with `.tmp' but not used in the same packet diff --git a/llvm/test/MC/Hexagon/hvx-swapped-regpairs.s b/llvm/test/MC/Hexagon/hvx-swapped-regpairs.s new file mode 100644 index 0000000000000..1ddec177e7838 --- /dev/null +++ b/llvm/test/MC/Hexagon/hvx-swapped-regpairs.s @@ -0,0 +1,43 @@ +# RUN: llvm-mc -filetype=obj -arch=hexagon -mcpu=hexagonv67 -mhvx %s | llvm-objdump -d -mcpu=hexagonv67 -mhvx - | FileCheck %s +# RUN: not llvm-mc -arch=hexagon -mcpu=hexagonv65 -mhvx 
-filetype=asm %s 2>%t; FileCheck --check-prefix=CHECK-V65 --implicit-check-not="error:" %s <%t + +v1:0.w = vadd(v0.h, v1.h) // Normal +# CHECK: 1ca1c080 + +v0:1.w = vadd(v0.h, v1.h) // Swapped +# CHECK-NEXT: 1ca1c081 +# CHECK-V65: error: register pair `WR0' is not permitted for this architecture + +## Swapped use: +v1:0.w = vtmpy(v0:1.h,r0.b) +# CHECK-NEXT: 19a0c180 +# CHECK-V65: error: register pair `WR0' is not permitted for this architecture + +## Swapped def +v0:1 = v3:2 +# CHECK-NEXT: 1f42c3e1 { v0:1 = vcombine(v3,v2) } +# CHECK-V65: error: register pair `WR0' is not permitted for this architecture + +# Mapped instruction's swapped use: +v1:0 = v2:3 +# CHECK-NEXT: v1:0 = vcombine(v2,v3) +## No error for v65, this is now permitted! + +## .new producer from pair: +{ + v0:1 = vaddw(v0:1, v0:1) + if (!p0) vmem(r0+#0)=v0.new +} +# CHECK-NEXT: v0:1.w = vadd(v0:1.w,v0:1.w) +# CHECK-NEXT: if (!p0) vmem(r0+#0) = v0.new +# CHECK-V65: error: register pair `WR0' is not permitted for this architecture + +## Used .tmp, swapped use & def: +{ + v0.tmp = vmem(r0 + #3) + v2:3 = vaddw(v0:1, v0:1) +} +# CHECK-NEXT: 1c6141c3 { v2:3.w = vadd(v0:1.w,v0:1.w) +# CHECK-NEXT: v0.tmp = vmem(r0+#3) } +# CHECK-V65: error: register pair `WR0' is not permitted for this architecture +# CHECK-V65: error: register pair `WR1' is not permitted for this architecture diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll index 49144beec3afc..28137f646ace5 100644 --- a/llvm/test/Other/opt-O2-pipeline.ll +++ b/llvm/test/Other/opt-O2-pipeline.ll @@ -101,16 +101,17 @@ ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Reassociate expressions ; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) -; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic 
Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops +; CHECK-NEXT: Memory SSA +; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Unswitch loops ; CHECK-NEXT: Simplify the CFG @@ -198,13 +199,12 @@ ; CHECK-NEXT: Float to int ; CHECK-NEXT: Lower constant intrinsics ; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) -; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll index cbb90eef5ef7c..61b5b9c973b87 100644 --- a/llvm/test/Other/opt-O3-pipeline.ll +++ b/llvm/test/Other/opt-O3-pipeline.ll @@ -106,16 +106,17 @@ ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Reassociate expressions ; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) -; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops +; CHECK-NEXT: Memory SSA +; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Unswitch loops ; CHECK-NEXT: Simplify the CFG @@ -203,13 +204,12 @@ ; CHECK-NEXT: Float to int ; CHECK-NEXT: 
Lower constant intrinsics ; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) -; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll index ce3801388b1e6..81f82d080c709 100644 --- a/llvm/test/Other/opt-Os-pipeline.ll +++ b/llvm/test/Other/opt-Os-pipeline.ll @@ -88,16 +88,17 @@ ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Reassociate expressions ; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) -; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops +; CHECK-NEXT: Memory SSA +; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Unswitch loops ; CHECK-NEXT: Simplify the CFG @@ -185,13 +186,12 @@ ; CHECK-NEXT: Float to int ; CHECK-NEXT: Lower constant intrinsics ; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) -; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; 
CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops diff --git a/llvm/test/Other/pass-pipelines.ll b/llvm/test/Other/pass-pipelines.ll index b3887cf6f969c..718ca46e2ed2a 100644 --- a/llvm/test/Other/pass-pipelines.ll +++ b/llvm/test/Other/pass-pipelines.ll @@ -54,6 +54,7 @@ ; CHECK-O2-NEXT: FunctionPass Manager ; CHECK-O2-NOT: Manager ; CHECK-O2: Loop Pass Manager +; CHECK-O2: Loop Pass Manager ; CHECK-O2-NOT: Manager ; FIXME: We shouldn't be pulling out to simplify-cfg and instcombine and ; causing new loop pass managers. diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll new file mode 100644 index 0000000000000..bf42d2f5ff646 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll @@ -0,0 +1,38 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s + +; GCN-LABEL: @bswap_v2i16( +; GFX7: call i16 @llvm.bswap.i16( +; GFX7: call i16 @llvm.bswap.i16( + +; GFX8: call <2 x i16> @llvm.bswap.v2i16( +define <2 x i16> @bswap_v2i16(<2 x i16> %arg) { +bb: + %tmp = extractelement <2 x i16> %arg, i64 0 + %tmp1 = tail call i16 @llvm.bswap.i16(i16 %tmp) + %tmp2 = insertelement <2 x i16> undef, i16 %tmp1, i64 0 + %tmp3 = extractelement <2 x i16> %arg, i64 1 + %tmp4 = tail call i16 @llvm.bswap.i16(i16 %tmp3) + %tmp5 = insertelement <2 x i16> %tmp2, i16 %tmp4, i64 1 + ret <2 x i16> %tmp5 +} + +; GCN-LABEL: @bswap_v2i32( +; GCN: call i32 @llvm.bswap.i32 +; GCN: call i32 @llvm.bswap.i32 +define <2 x i32> @bswap_v2i32(<2 x i32> %arg) { +bb: + %tmp = extractelement <2 x i32> %arg, i64 0 + %tmp1 = tail call i32 @llvm.bswap.i32(i32 
%tmp) + %tmp2 = insertelement <2 x i32> undef, i32 %tmp1, i64 0 + %tmp3 = extractelement <2 x i32> %arg, i64 1 + %tmp4 = tail call i32 @llvm.bswap.i32(i32 %tmp3) + %tmp5 = insertelement <2 x i32> %tmp2, i32 %tmp4, i64 1 + ret <2 x i32> %tmp5 +} + +declare i16 @llvm.bswap.i16(i16) #0 +declare i32 @llvm.bswap.i32(i32) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/tools/UpdateTestChecks/lit.local.cfg b/llvm/test/tools/UpdateTestChecks/lit.local.cfg index 74164e808e476..d9d11b5a06c07 100644 --- a/llvm/test/tools/UpdateTestChecks/lit.local.cfg +++ b/llvm/test/tools/UpdateTestChecks/lit.local.cfg @@ -42,11 +42,3 @@ if os.path.isfile(llvm_mca_path): config.available_features.add('llvm-mca-binary') mca_arg = '--llvm-mca-binary ' + shell_quote(llvm_mca_path) add_update_script_substition('%update_test_checks', extra_args=mca_arg) - -clang_path = os.path.join(config.llvm_tools_dir, 'clang') -if os.path.isfile(clang_path): - config.available_features.add('clang-binary') - extra_args = '--clang ' + shell_quote(clang_path) - if os.path.isfile(opt_path): - extra_args += ' --opt ' + shell_quote(opt_path) - add_update_script_substition('%update_cc_test_checks', extra_args=extra_args) diff --git a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/lit.local.cfg b/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/lit.local.cfg deleted file mode 100644 index 99346daabcb06..0000000000000 --- a/llvm/test/tools/UpdateTestChecks/update_cc_test_checks/lit.local.cfg +++ /dev/null @@ -1,3 +0,0 @@ -# These tests require clang. 
-if 'clang-binary' not in config.available_features: - config.unsupported = True diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp index 6b2608160aea9..54adeaa11c1a2 100644 --- a/llvm/tools/dsymutil/dsymutil.cpp +++ b/llvm/tools/dsymutil/dsymutil.cpp @@ -258,7 +258,7 @@ static Expected getOptions(opt::InputArgList &Args) { if (opt::Arg *NumThreads = Args.getLastArg(OPT_threads)) Options.LinkOpts.Threads = atoi(NumThreads->getValue()); else - Options.LinkOpts.Threads = thread::hardware_concurrency(); + Options.LinkOpts.Threads = 0; // Use all available hardware threads if (Options.DumpDebugMap || Options.LinkOpts.Verbose) Options.LinkOpts.Threads = 1; @@ -541,9 +541,10 @@ int main(int argc, char **argv) { // Shared a single binary holder for all the link steps. BinaryHolder BinHolder; - unsigned ThreadCount = - std::min(Options.LinkOpts.Threads, DebugMapPtrsOrErr->size()); - ThreadPool Threads(ThreadCount); + unsigned ThreadCount = Options.LinkOpts.Threads; + if (!ThreadCount) + ThreadCount = DebugMapPtrsOrErr->size(); + ThreadPool Threads(hardware_concurrency(ThreadCount)); // If there is more than one link to execute, we need to generate // temporary files. diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp index f68f7183e034c..2a9398fed9f35 100644 --- a/llvm/tools/gold/gold-plugin.cpp +++ b/llvm/tools/gold/gold-plugin.cpp @@ -134,8 +134,8 @@ namespace options { static unsigned OptLevel = 2; // Default parallelism of 0 used to indicate that user did not specify. // Actual parallelism default value depends on implementation. - // Currently only affects ThinLTO, where the default is - // llvm::heavyweight_hardware_concurrency. + // Currently only affects ThinLTO, where the default is the max cores in the + // system. static unsigned Parallelism = 0; // Default regular LTO codegen parallelism (number of partitions). 
static unsigned ParallelCodeGenParallelismLevel = 1; diff --git a/llvm/tools/llvm-cov/CodeCoverage.cpp b/llvm/tools/llvm-cov/CodeCoverage.cpp index 52e9958e92da9..625e2342e4bb3 100644 --- a/llvm/tools/llvm-cov/CodeCoverage.cpp +++ b/llvm/tools/llvm-cov/CodeCoverage.cpp @@ -947,9 +947,7 @@ int CodeCoverageTool::doShow(int argc, const char **argv, // If NumThreads is not specified, auto-detect a good default. if (NumThreads == 0) - NumThreads = - std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(), - unsigned(SourceFiles.size()))); + NumThreads = SourceFiles.size(); if (!ViewOpts.hasOutputDirectory() || NumThreads == 1) { for (const std::string &SourceFile : SourceFiles) @@ -957,7 +955,7 @@ int CodeCoverageTool::doShow(int argc, const char **argv, ShowFilenames); } else { // In -output-dir mode, it's safe to use multiple threads to print files. - ThreadPool Pool(NumThreads); + ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads)); for (const std::string &SourceFile : SourceFiles) Pool.async(&CodeCoverageTool::writeSourceFileView, this, SourceFile, Coverage.get(), Printer.get(), ShowFilenames); diff --git a/llvm/tools/llvm-cov/CoverageExporterJson.cpp b/llvm/tools/llvm-cov/CoverageExporterJson.cpp index 216b5e3fd2263..ba8ff5c8fe523 100644 --- a/llvm/tools/llvm-cov/CoverageExporterJson.cpp +++ b/llvm/tools/llvm-cov/CoverageExporterJson.cpp @@ -163,11 +163,9 @@ json::Array renderFiles(const coverage::CoverageMapping &Coverage, ArrayRef FileReports, const CoverageViewOptions &Options) { auto NumThreads = Options.NumThreads; - if (NumThreads == 0) { - NumThreads = std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(), - unsigned(SourceFiles.size()))); - } - ThreadPool Pool(NumThreads); + if (NumThreads == 0) + NumThreads = SourceFiles.size(); + ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads)); json::Array FileArray; std::mutex FileArrayMutex; diff --git a/llvm/tools/llvm-cov/CoverageReport.cpp 
b/llvm/tools/llvm-cov/CoverageReport.cpp index 82259542c5970..187e2dc4f553a 100644 --- a/llvm/tools/llvm-cov/CoverageReport.cpp +++ b/llvm/tools/llvm-cov/CoverageReport.cpp @@ -356,11 +356,8 @@ std::vector CoverageReport::prepareFileReports( // If NumThreads is not specified, auto-detect a good default. if (NumThreads == 0) - NumThreads = - std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(), - unsigned(Files.size()))); - - ThreadPool Pool(NumThreads); + NumThreads = Files.size(); + ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads)); std::vector FileReports; FileReports.reserve(Files.size()); diff --git a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.h b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.h index d1e20b9b36a83..65a3fe61aecb0 100644 --- a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.h +++ b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.h @@ -162,11 +162,28 @@ class CombinationGenerator { SmallVector, variable_smallsize> VariablesState; + // 'increment' of the the whole VariablesState is defined identically to the + // increment of a number: starting from the least significant element, + // increment it, and if it wrapped, then propagate that carry by also + // incrementing next (more significant) element. + auto IncrementState = + [](MutableArrayRef> VariablesState) + -> bool { + for (WrappingIterator &Variable : + llvm::reverse(VariablesState)) { + bool Wrapped = ++Variable; + if (!Wrapped) + return false; // There you go, next combination is ready. + // We have carry - increment more significant variable next.. + } + return true; // MSB variable wrapped, no more unique combinations. + }; + // Initialize the per-variable state to refer to the possible choices for // that variable. 
VariablesState.reserve(VariablesChoices.size()); - for (ArrayRef VariablesChoices : VariablesChoices) - VariablesState.emplace_back(VariablesChoices); + for (ArrayRef VC : VariablesChoices) + VariablesState.emplace_back(VC); // Temporary buffer to store each combination before performing Callback. SmallVector CurrentCombination; @@ -179,23 +196,9 @@ class CombinationGenerator { // And pass the new combination into callback, as intended. if (/*Abort=*/Callback(CurrentCombination)) return; - - // 'increment' the whole VariablesState, much like you would increment - // a number: starting from the least significant element, increment it, - // and if it wrapped, then propagate that carry by also incrementing next - // (more significant) element. - for (WrappingIterator &VariableState : - llvm::reverse(VariablesState)) { - bool Wrapped = ++VariableState; - if (!Wrapped) - break; - - if (VariablesState.begin() == &VariableState) - return; // The "most significant" variable has wrapped, which means - // that we have produced all the combinations. - - // We have carry - increment more significant variable next.. - } + // And tick the state to next combination, which will be unique. + if (IncrementState(VariablesState)) + return; // All combinations produced. } }; diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp index 8deedd49e0501..fc86fd969efba 100644 --- a/llvm/tools/llvm-lto2/llvm-lto2.cpp +++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp @@ -65,8 +65,8 @@ static cl::opt "import files for the " "distributed backend case")); -static cl::opt Threads("thinlto-threads", - cl::init(llvm::heavyweight_hardware_concurrency())); +// Default to using all hardware cores in the system. 
+static cl::opt Threads("thinlto-threads", cl::init(0)); static cl::list SymbolResolutions( "r", diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index 424edf446d035..f05c7e637cd55 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -307,8 +307,11 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, // If NumThreads is not specified, auto-detect a good default. if (NumThreads == 0) - NumThreads = - std::min(hardware_concurrency(), unsigned((Inputs.size() + 1) / 2)); + NumThreads = std::min(hardware_concurrency().compute_thread_count(), + unsigned((Inputs.size() + 1) / 2)); + // FIXME: There's a bug here, where setting NumThreads = Inputs.size() fails + // the merge_empty_profile.test because the InstrProfWriter.ProfileKind isn't + // merged, thus the emitted file ends up with a PF_Unknown kind. // Initialize the writer contexts. SmallVector, 4> Contexts; @@ -320,7 +323,7 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, for (const auto &Input : Inputs) loadInput(Input, Remapper, Contexts[0].get()); } else { - ThreadPool Pool(NumThreads); + ThreadPool Pool(hardware_concurrency(NumThreads)); // Load the inputs in parallel (N/NumThreads serial steps). 
unsigned Ctx = 0; diff --git a/llvm/unittests/ADT/APIntTest.cpp b/llvm/unittests/ADT/APIntTest.cpp index 8191ec86e61fa..c3a9bda3c817a 100644 --- a/llvm/unittests/ADT/APIntTest.cpp +++ b/llvm/unittests/ADT/APIntTest.cpp @@ -1815,6 +1815,14 @@ TEST(APIntTest, SelfMoveAssignment) { #endif #endif // _MSC_VER +TEST(APIntTest, byteSwap) { + EXPECT_EQ(0x00000000, APInt(16, 0x0000).byteSwap()); + EXPECT_EQ(0x0000010f, APInt(16, 0x0f01).byteSwap()); + EXPECT_EQ(0x117700ff, APInt(32, 0xff007711).byteSwap()); + EXPECT_EQ(0x050403020100ULL, APInt(48, 0x000102030405ULL).byteSwap()); + EXPECT_EQ(0xff050403020100aaULL, APInt(64, 0xaa000102030405ffULL).byteSwap()); +} + TEST(APIntTest, reverseBits) { EXPECT_EQ(1, APInt(1, 1).reverseBits()); EXPECT_EQ(0, APInt(1, 0).reverseBits()); diff --git a/llvm/unittests/ADT/BitVectorTest.cpp b/llvm/unittests/ADT/BitVectorTest.cpp index 5d3830972a626..c933388361411 100644 --- a/llvm/unittests/ADT/BitVectorTest.cpp +++ b/llvm/unittests/ADT/BitVectorTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallBitVector.h" #include "gtest/gtest.h" @@ -1149,4 +1150,40 @@ TYPED_TEST(BitVectorTest, PushBack) { EXPECT_EQ(213U, Vec.size()); EXPECT_EQ(102U, Vec.count()); } + +TYPED_TEST(BitVectorTest, DenseSet) { + DenseSet Set; + TypeParam A(10, true); + auto I = Set.insert(A); + EXPECT_EQ(true, I.second); + + TypeParam B(5, true); + I = Set.insert(B); + EXPECT_EQ(true, I.second); + + TypeParam C(20, false); + C.set(19); + I = Set.insert(C); + EXPECT_EQ(true, I.second); + +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + TypeParam D; + EXPECT_DEATH(Set.insert(D), + "Empty/Tombstone value shouldn't be inserted into map!"); +#endif + + EXPECT_EQ(3U, Set.size()); + EXPECT_EQ(1U, Set.count(A)); + EXPECT_EQ(1U, Set.count(B)); + EXPECT_EQ(1U, Set.count(C)); + + EXPECT_EQ(true, Set.erase(B)); + EXPECT_EQ(2U, Set.size()); + + 
EXPECT_EQ(true, Set.erase(C)); + EXPECT_EQ(1U, Set.size()); + + EXPECT_EQ(true, Set.erase(A)); + EXPECT_EQ(0U, Set.size()); } +} // namespace diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp index 0552b5ad6f7ba..a622c84c1f4d6 100644 --- a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp @@ -670,7 +670,7 @@ TEST_F(DebugLineBasicFixture, ASSERT_EQ((*ExpectedLineTable)->Rows.size(), 3u); EXPECT_EQ((*ExpectedLineTable)->Sequences.size(), 1u); // Show that the set address opcode is ignored in this case. - EXPECT_EQ((*ExpectedLineTable)->Rows[0].Address.Address, 0); + EXPECT_EQ((*ExpectedLineTable)->Rows[0].Address.Address, 0u); } TEST_F(DebugLineBasicFixture, ErrorForAddressSizeGreaterThanByteSize) { @@ -731,7 +731,7 @@ TEST_F(DebugLineBasicFixture, ErrorForUnsupportedAddressSizeDefinedInHeader) { ASSERT_EQ((*ExpectedLineTable)->Rows.size(), 3u); EXPECT_EQ((*ExpectedLineTable)->Sequences.size(), 1u); // Show that the set address opcode is ignored in this case. 
- EXPECT_EQ((*ExpectedLineTable)->Rows[0].Address.Address, 0); + EXPECT_EQ((*ExpectedLineTable)->Rows[0].Address.Address, 0u); } TEST_F(DebugLineBasicFixture, CallbackUsedForUnterminatedSequence) { diff --git a/llvm/unittests/Frontend/OpenMPContextTest.cpp b/llvm/unittests/Frontend/OpenMPContextTest.cpp index 8741b825cb61e..eb505be042cb3 100644 --- a/llvm/unittests/Frontend/OpenMPContextTest.cpp +++ b/llvm/unittests/Frontend/OpenMPContextTest.cpp @@ -38,12 +38,11 @@ TEST_F(OpenMPContextTest, RoundTripAndAssociation) { #define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \ EXPECT_EQ(TraitProperty::Enum, \ getOpenMPContextTraitPropertyKind( \ - TraitSet::TraitSetEnum, TraitSelector::TraitSelectorEnum, \ + TraitSet::TraitSetEnum, \ getOpenMPContextTraitPropertyName(TraitProperty::Enum))); \ - EXPECT_EQ( \ - Str, \ - getOpenMPContextTraitPropertyName(getOpenMPContextTraitPropertyKind( \ - TraitSet::TraitSetEnum, TraitSelector::TraitSelectorEnum, Str))); \ + EXPECT_EQ(Str, getOpenMPContextTraitPropertyName( \ + getOpenMPContextTraitPropertyKind(TraitSet::TraitSetEnum, \ + Str))); \ EXPECT_EQ(TraitSet::TraitSetEnum, \ getOpenMPContextTraitSetForProperty(TraitProperty::Enum)); \ EXPECT_EQ(TraitSelector::TraitSelectorEnum, \ diff --git a/llvm/unittests/Support/Host.cpp b/llvm/unittests/Support/Host.cpp index 2c17a5094101e..62252347d62aa 100644 --- a/llvm/unittests/Support/Host.cpp +++ b/llvm/unittests/Support/Host.cpp @@ -37,7 +37,8 @@ class HostTest : public testing::Test { // Initially this is only testing detection of the number of // physical cores, which is currently only supported/tested for // x86_64 Linux and Darwin. 
- return (Host.getArch() == Triple::x86_64 && + return Host.isOSWindows() || + (Host.getArch() == Triple::x86_64 && (Host.isOSDarwin() || Host.getOS() == Triple::Linux)); } diff --git a/llvm/unittests/Support/TaskQueueTest.cpp b/llvm/unittests/Support/TaskQueueTest.cpp index 0a8aeca4e2d6f..4d8c3e4064b49 100644 --- a/llvm/unittests/Support/TaskQueueTest.cpp +++ b/llvm/unittests/Support/TaskQueueTest.cpp @@ -22,7 +22,7 @@ class TaskQueueTest : public testing::Test { }; TEST_F(TaskQueueTest, OrderedFutures) { - ThreadPool TP(1); + ThreadPool TP(hardware_concurrency(1)); TaskQueue TQ(TP); std::atomic X{ 0 }; std::atomic Y{ 0 }; @@ -66,7 +66,7 @@ TEST_F(TaskQueueTest, OrderedFutures) { } TEST_F(TaskQueueTest, UnOrderedFutures) { - ThreadPool TP(1); + ThreadPool TP(hardware_concurrency(1)); TaskQueue TQ(TP); std::atomic X{ 0 }; std::atomic Y{ 0 }; @@ -96,7 +96,7 @@ TEST_F(TaskQueueTest, UnOrderedFutures) { } TEST_F(TaskQueueTest, FutureWithReturnValue) { - ThreadPool TP(1); + ThreadPool TP(hardware_concurrency(1)); TaskQueue TQ(TP); std::future F1 = TQ.async([&] { return std::string("Hello"); }); std::future F2 = TQ.async([&] { return 42; }); diff --git a/llvm/unittests/Support/ThreadPool.cpp b/llvm/unittests/Support/ThreadPool.cpp index a16adbbb78a75..237be875909b7 100644 --- a/llvm/unittests/Support/ThreadPool.cpp +++ b/llvm/unittests/Support/ThreadPool.cpp @@ -8,11 +8,13 @@ #include "llvm/Support/ThreadPool.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Triple.h" #include "llvm/Support/Host.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Support/Threading.h" #include "gtest/gtest.h" @@ -69,6 +71,8 @@ class ThreadPoolTest : public testing::Test { void SetUp() override { MainThreadReady = false; } + void TestAllThreads(ThreadPoolStrategy S); + std::condition_variable WaitMainThread; std::mutex WaitMainThreadMutex; bool MainThreadReady = false; @@ -131,7 +135,7 @@ TEST_F(ThreadPoolTest, 
Async) { TEST_F(ThreadPoolTest, GetFuture) { CHECK_UNSUPPORTED(); - ThreadPool Pool{2}; + ThreadPool Pool(hardware_concurrency(2)); std::atomic_int i{0}; Pool.async([this, &i] { waitForMainThread(); @@ -162,3 +166,45 @@ TEST_F(ThreadPoolTest, PoolDestruction) { } ASSERT_EQ(5, checked_in); } + +#if LLVM_ENABLE_THREADS == 1 + +void ThreadPoolTest::TestAllThreads(ThreadPoolStrategy S) { + // FIXME: Skip these tests on non-Windows because multi-socket system were not + // tested on Unix yet, and llvm::get_thread_affinity_mask() isn't implemented + // for Unix. + Triple Host(Triple::normalize(sys::getProcessTriple())); + if (!Host.isOSWindows()) + return; + + llvm::DenseSet ThreadsUsed; + std::mutex Lock; + unsigned Threads = 0; + { + ThreadPool Pool(S); + Threads = Pool.getThreadCount(); + for (size_t I = 0; I < 10000; ++I) { + Pool.async([&] { + waitForMainThread(); + std::lock_guard Guard(Lock); + auto Mask = llvm::get_thread_affinity_mask(); + ThreadsUsed.insert(Mask); + }); + } + ASSERT_EQ(true, ThreadsUsed.empty()); + setMainThreadReady(); + } + ASSERT_EQ(llvm::get_cpus(), ThreadsUsed.size()); +} + +TEST_F(ThreadPoolTest, AllThreads_UseAllRessources) { + CHECK_UNSUPPORTED(); + TestAllThreads({}); +} + +TEST_F(ThreadPoolTest, AllThreads_OneThreadPerCore) { + CHECK_UNSUPPORTED(); + TestAllThreads(llvm::heavyweight_hardware_concurrency()); +} + +#endif diff --git a/llvm/unittests/Support/Threading.cpp b/llvm/unittests/Support/Threading.cpp index 183c9aa7d71c4..c76e6e4a5bd17 100644 --- a/llvm/unittests/Support/Threading.cpp +++ b/llvm/unittests/Support/Threading.cpp @@ -21,7 +21,8 @@ TEST(Threading, PhysicalConcurrency) { auto Num = heavyweight_hardware_concurrency(); // Since Num is unsigned this will also catch us trying to // return -1. 
- ASSERT_LE(Num, thread::hardware_concurrency()); + ASSERT_LE(Num.compute_thread_count(), + hardware_concurrency().compute_thread_count()); } #if LLVM_ENABLE_THREADS diff --git a/llvm/utils/gn/build/BUILD.gn b/llvm/utils/gn/build/BUILD.gn index 7947a367cf288..b74268996f150 100644 --- a/llvm/utils/gn/build/BUILD.gn +++ b/llvm/utils/gn/build/BUILD.gn @@ -59,8 +59,8 @@ config("compiler_defaults") { } if (is_optimized) { cflags += [ - # FIXME: evaluate /Gw (not part of /O2) "/O2", + "/Gw", "/Zc:inline", ] ldflags += [ diff --git a/llvm/utils/update_cc_test_checks.py b/llvm/utils/update_cc_test_checks.py index 21cc5b4e5e302..9b236dbd24319 100755 --- a/llvm/utils/update_cc_test_checks.py +++ b/llvm/utils/update_cc_test_checks.py @@ -292,10 +292,10 @@ def main(): False, args.function_signature) output_lines.append(line.rstrip('\n')) - # Update the test file. - with open(filename, 'w') as f: - for line in output_lines: - f.write(line + '\n') + + common.debug('Writing %d lines to %s...' % (len(output_lines), filename)) + with open(filename, 'wb') as f: + f.writelines(['{}\n'.format(l).encode('utf-8') for l in output_lines]) return 0 diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h index 53f2fea1ccfc6..cf6316cae643a 100644 --- a/mlir/include/mlir/Transforms/LoopUtils.h +++ b/mlir/include/mlir/Transforms/LoopUtils.h @@ -171,9 +171,11 @@ struct AffineCopyOptions { /// by its root affine.for. Since we generate alloc's and dealloc's for all fast /// buffers (before and after the range of operations resp. or at a hoisted /// position), all of the fast memory capacity is assumed to be available for -/// processing this block range. +/// processing this block range. When 'filterMemRef' is specified, copies are +/// only generated for the provided MemRef. 
uint64_t affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions ©Options, + Optional filterMemRef, DenseSet ©Nests); /// Tile a nest of standard for loops rooted at `rootForOp` by finding such @@ -220,6 +222,11 @@ void coalesceLoops(MutableArrayRef loops); /// ``` void mapLoopToProcessorIds(loop::ForOp forOp, ArrayRef processorId, ArrayRef numProcessors); + +/// Gathers all AffineForOps in 'func' grouped by loop depth. +void gatherLoops(FuncOp func, + DenseMap> &depthToLoops); + } // end namespace mlir #endif // MLIR_TRANSFORMS_LOOP_UTILS_H diff --git a/mlir/lib/Pass/Pass.cpp b/mlir/lib/Pass/Pass.cpp index fc40ff18ce06f..41adb623e74d6 100644 --- a/mlir/lib/Pass/Pass.cpp +++ b/mlir/lib/Pass/Pass.cpp @@ -411,7 +411,8 @@ void OpToOpPassAdaptorParallel::runOnOperation() { // Create the async executors if they haven't been created, or if the main // pipeline has changed. if (asyncExecutors.empty() || hasSizeMismatch(asyncExecutors.front(), mgrs)) - asyncExecutors.assign(llvm::hardware_concurrency(), mgrs); + asyncExecutors.assign(llvm::hardware_concurrency().compute_thread_count(), + mgrs); // Run a prepass over the module to collect the operations to execute over. // This ensures that an analysis manager exists for each operation, as well as diff --git a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Transforms/AffineDataCopyGeneration.cpp index 449dcfafeceb0..5409c557da83e 100644 --- a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp +++ b/mlir/lib/Transforms/AffineDataCopyGeneration.cpp @@ -179,7 +179,7 @@ AffineDataCopyGeneration::runOnBlock(Block *block, if ((forOp = dyn_cast(&*it)) && copyNests.count(forOp) == 0) { // Perform the copying up unti this 'for' op first. affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions, - copyNests); + /*filterMemRef=*/llvm::None, copyNests); // Returns true if the footprint is known to exceed capacity. 
auto exceedsCapacity = [&](AffineForOp forOp) { @@ -213,7 +213,7 @@ AffineDataCopyGeneration::runOnBlock(Block *block, // consumed capacity. The footprint check above guarantees this inner // loop's footprint fits. affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions, - copyNests); + /*filterMemRef=*/llvm::None, copyNests); } // Get to the next load or store op after 'forOp'. curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) { @@ -236,7 +236,7 @@ AffineDataCopyGeneration::runOnBlock(Block *block, assert(!curBegin->isKnownTerminator() && "can't be a terminator"); // Exclude the affine terminator - hence, the std::prev. affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()), - copyOptions, copyNests); + copyOptions, /*filterMemRef=*/llvm::None, copyNests); } return success(); diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp index 56f954f214225..da3d819cbc3ed 100644 --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -1585,16 +1585,21 @@ static bool getFullMemRefAsRegion(Operation *opInst, unsigned numParamLoopIVs, return true; } -/// Generates copies for a contiguous sequence of operations in `block` in the -/// iterator range [`begin', `end'), where `end' can't be past the terminator of -/// the block (since additional operations are potentially inserted right before -/// `end'. Returns the total size of the fast buffers used. -// Since we generate alloc's and dealloc's for all fast buffers (before and -// after the range of operations resp.), all of the fast memory capacity is -// assumed to be available for processing this block range. +/// Performs explicit copying for the contiguous sequence of operations in the +/// block iterator range [`begin', `end'), where `end' can't be past the +/// terminator of the block (since additional operations are potentially +/// inserted right before `end`).
Returns the total size of fast memory space +/// buffers used. `copyOptions` provides various parameters, and the output +/// argument `copyNests` is the set of all copy nests inserted, each represented +/// by its root affine.for. Since we generate alloc's and dealloc's for all fast +/// buffers (before and after the range of operations resp. or at a hoisted +/// position), all of the fast memory capacity is assumed to be available for +/// processing this block range. When 'filterMemRef' is specified, copies are +/// only generated for the provided MemRef. uint64_t mlir::affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions ©Options, + Optional filterMemRef, DenseSet ©Nests) { if (begin == end) return 0; @@ -1631,12 +1636,14 @@ uint64_t mlir::affineDataCopyGenerate(Block::iterator begin, block->walk(begin, end, [&](Operation *opInst) { // Gather regions to allocate to buffers in faster memory space. if (auto loadOp = dyn_cast(opInst)) { - if ((loadOp.getMemRefType().getMemorySpace() != + if ((filterMemRef.hasValue() && filterMemRef != loadOp.getMemRef()) || + (loadOp.getMemRefType().getMemorySpace() != copyOptions.slowMemorySpace)) return; } else if (auto storeOp = dyn_cast(opInst)) { - if (storeOp.getMemRefType().getMemorySpace() != - copyOptions.slowMemorySpace) + if ((filterMemRef.hasValue() && filterMemRef != storeOp.getMemRef()) || + storeOp.getMemRefType().getMemorySpace() != + copyOptions.slowMemorySpace) return; } else { // Neither load nor a store op. @@ -1776,3 +1783,24 @@ uint64_t mlir::affineDataCopyGenerate(Block::iterator begin, return totalCopyBuffersSizeInBytes; } + +/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'. 
+static void gatherLoopsInBlock( + Block *block, unsigned currLoopDepth, + DenseMap> &depthToLoops) { + auto &loopsAtDepth = depthToLoops[currLoopDepth]; + for (auto &op : *block) { + if (auto forOp = dyn_cast(op)) { + loopsAtDepth.push_back(forOp); + gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops); + } + } +} + +/// Gathers all AffineForOps in 'func' grouped by loop depth. +void mlir::gatherLoops( + FuncOp func, + DenseMap> &depthToLoops) { + for (auto &block : func) + gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops); +} diff --git a/mlir/test/Transforms/affine-data-copy.mlir b/mlir/test/Transforms/affine-data-copy.mlir index c83beb183021b..b2e4fbbf76c19 100644 --- a/mlir/test/Transforms/affine-data-copy.mlir +++ b/mlir/test/Transforms/affine-data-copy.mlir @@ -2,6 +2,12 @@ // Small buffer size to trigger fine copies. // RUN: mlir-opt %s -affine-data-copy-generate -affine-data-copy-generate-dma=false -affine-data-copy-generate-fast-mem-space=0 -affine-data-copy-generate-fast-mem-capacity=1 | FileCheck --check-prefix=CHECK-SMALL %s +// Test affine data copy with a memref filter. We use a test pass that invokes +// affine data copy utility on the input loop nest. +// '-test-affine-data-copy-memref-filter' passes the first memref found in an +// affine.load op in the innermost loop as a filter. 
+// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter=1' | FileCheck %s --check-prefix=FILTER + // -copy-skip-non-stride-loops forces the copies to be placed right inside the // tile space loops, avoiding the sensitivity of copy placement depth to memory // footprint -- so that one could write a definite test case and not have to @@ -16,6 +22,7 @@ // CHECK-DAG: [[BUF_IDX_MAP:map[0-9]+]] = affine_map<(d0, d1, d2, d3) -> (-d0 + d2, -d1 + d3)> // CHECK-LABEL: func @matmul +// FILTER-LABEL: func @matmul func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> { affine.for %i = 0 to 4096 step 128 { affine.for %j = 0 to 4096 step 128 { @@ -110,11 +117,29 @@ func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<40 // CHECK: } // CHECK: } +// Check that only one memref is copied when memref filter is used. + +// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 { +// FILTER: alloc() : memref<128x4096xf32> +// FILTER-NOT: alloc() +// FILTER: affine.for %{{.*}} = 0 to 128 { +// FILTER: affine.for %{{.*}} = 0 to 4096 { +// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 { +// FILTER-NEXT: affine.for %{{.*}} = 0 to 4096 step 128 { +// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) { +// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) { +// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) { +// FILTER: dealloc %1 : memref<128x4096xf32> +// FILTER-NOT: dealloc %1 : memref<128x4096xf32> + +// ----- + // // This test case will lead to single element buffers. These are eventually // expected to be turned into registers via alloca and mem2reg. 
// -// CHECK-SMALL: func @foo +// CHECK-SMALL-LABEL: func @foo +// FILTER-LABEL: func @foo func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> { affine.for %i = 0 to 1024 { affine.for %j = 0 to 1024 { @@ -161,3 +186,15 @@ func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: mem // CHECK-SMALL: } // CHECK-SMALL: } // CHECK-SMALL: return + +// Check that only one memref is copied when memref filter is used. + +// FILTER: alloc() : memref<1024x1024xf32> +// FILTER-NOT: alloc() +// FILTER: affine.for %{{.*}} = 0 to 1024 { +// FILTER: affine.for %{{.*}} = 0 to 1024 { +// FILTER: affine.for %{{.*}} = 0 to 1024 { +// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 { +// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 { +// FILTER: dealloc %{{.*}} : memref<1024x1024xf32> +// FILTER-NOT: dealloc diff --git a/mlir/test/Transforms/dma-generate.mlir b/mlir/test/Transforms/dma-generate.mlir index 9724f990f97ca..b1e71e694690b 100644 --- a/mlir/test/Transforms/dma-generate.mlir +++ b/mlir/test/Transforms/dma-generate.mlir @@ -543,7 +543,7 @@ func @test_analysis_util(%arg0: memref<4x4x16x1xf32>, %arg1: memref<144x9xf32>, // CHECK: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32> // CHECK: affine.for %{{.*}} = -// ---- +// ----- #map3 = affine_map<(d0) -> (d0)> #map12 = affine_map<(d0) -> (d0 + 3)> @@ -551,6 +551,7 @@ func @test_analysis_util(%arg0: memref<4x4x16x1xf32>, %arg1: memref<144x9xf32>, #map15 = affine_map<(d0, d1) -> ((d0 + d1 * 72) mod 2304 - (((d0 + d1 * 72) mod 2304) floordiv 1152) * 1151 - ((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) * 9 - (((((d0 + d1 * 72) mod 2304) mod 1152) mod 9) floordiv 3) * 3)> #map16 = affine_map<(d0, d1) -> (((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) floordiv 8)> // Test for test case in b/128303048 #4. 
+// CHECK-LABEL: func @test_memref_bounds func @test_memref_bounds(%arg0: memref<4x4x16x1xvector<8x128xf32>>, %arg1: memref<144x9xvector<8x128xf32>>, %arg2: memref<2xvector<8x128xf32>>) -> (memref<144x9xvector<8x128xf32>>, memref<2xvector<8x128xf32>>) { %c0 = constant 0 : index affine.for %i8 = 0 to 9 step 3 { diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt index 47a0dd92cd062..8c422e718f1fd 100644 --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_library(MLIRTestTransforms + TestAffineDataCopy.cpp TestAllReduceLowering.cpp TestCallGraph.cpp TestConstantFold.cpp diff --git a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp new file mode 100644 index 0000000000000..e03d45cb9dd45 --- /dev/null +++ b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp @@ -0,0 +1,86 @@ +//===- TestAffineDataCopy.cpp - Test affine data copy utility -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass to test affine data copy utility functions and +// options. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/Passes.h" +#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/LoopUtils.h" +#include "mlir/Transforms/Passes.h" + +#define PASS_NAME "test-affine-data-copy" + +using namespace mlir; + +static llvm::cl::OptionCategory clOptionsCategory(PASS_NAME " options"); + +namespace { + +struct TestAffineDataCopy : public FunctionPass { + TestAffineDataCopy() = default; + TestAffineDataCopy(const TestAffineDataCopy &pass){}; + + void runOnFunction() override; + +private: + Option clMemRefFilter{ + *this, "memref-filter", + llvm::cl::desc( + "Enable memref filter testing in affine data copy optimization"), + llvm::cl::init(false)}; +}; + +} // end anonymous namespace + +void TestAffineDataCopy::runOnFunction() { + // Gather all AffineForOps by loop depth. + DenseMap> depthToLoops; + gatherLoops(getFunction(), depthToLoops); + assert(depthToLoops.size() && "Loop nest not found"); + + // Only support tests with a single loop nest and a single innermost loop + // for now. + unsigned innermostLoopIdx = depthToLoops.size() - 2; + if (depthToLoops[0].size() != 1 || depthToLoops[innermostLoopIdx].size() != 1) + return; + + auto loopNest = depthToLoops[0][0]; + auto innermostLoop = depthToLoops[innermostLoopIdx][0]; + Optional memrefFilter; + if (clMemRefFilter) { + // Gather MemRef filter. For simplicity, we use the first loaded memref + // found in the innermost loop. 
+ for (auto &op : *innermostLoop.getBody()) { + if (auto load = dyn_cast(op)) { + memrefFilter = load.getMemRef(); + break; + } + } + } + + AffineCopyOptions copyOptions = {/*generateDma=*/false, + /*slowMemorySpace=*/0, + /*fastMemorySpace=*/0, + /*tagMemorySpace=*/0, + /*fastMemCapacityBytes=*/32 * 1024 * 1024UL}; + DenseSet copyNests; + affineDataCopyGenerate(loopNest.getBody()->begin(), + std::prev(loopNest.getBody()->end()), copyOptions, + memrefFilter, copyNests); +} + +namespace mlir { +void registerTestAffineDataCopyPass() { + PassRegistration( + PASS_NAME, "Tests affine data copy utility functions."); +} +} // namespace mlir diff --git a/mlir/test/lib/Transforms/TestLoopFusion.cpp b/mlir/test/lib/Transforms/TestLoopFusion.cpp index 9ffa347173f65..9214fa9fc4333 100644 --- a/mlir/test/lib/Transforms/TestLoopFusion.cpp +++ b/mlir/test/lib/Transforms/TestLoopFusion.cpp @@ -19,6 +19,7 @@ #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopFusionUtils.h" +#include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Passes.h" #include "llvm/ADT/STLExtras.h" @@ -54,19 +55,6 @@ struct TestLoopFusion : public FunctionPass { } // end anonymous namespace -// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'. -static void -gatherLoops(Block *block, unsigned currLoopDepth, - DenseMap> &depthToLoops) { - auto &loopsAtDepth = depthToLoops[currLoopDepth]; - for (auto &op : *block) { - if (auto forOp = dyn_cast(op)) { - loopsAtDepth.push_back(forOp); - gatherLoops(forOp.getBody(), currLoopDepth + 1, depthToLoops); - } - } -} - // Run fusion dependence check on 'loops[i]' and 'loops[j]' at loop depths // in range ['loopDepth' + 1, 'maxLoopDepth']. // Emits a remark on 'loops[i]' if a fusion-preventing dependence exists. @@ -194,8 +182,7 @@ void TestLoopFusion::runOnFunction() { do { depthToLoops.clear(); // Gather all AffineForOps by loop depth. 
- for (auto &block : getFunction()) - gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops); + gatherLoops(getFunction(), depthToLoops); // Try to fuse all combinations of src/dst loop nests in 'depthToLoops'. } while (iterateLoops(depthToLoops, testLoopFusionTransformation, @@ -204,8 +191,7 @@ void TestLoopFusion::runOnFunction() { } // Gather all AffineForOps by loop depth. - for (Block &block : getFunction()) - gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops); + gatherLoops(getFunction(), depthToLoops); // Run tests on all combinations of src/dst loop nests in 'depthToLoops'. if (clTestDependenceCheck) diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index bf6b57c2b6247..4df330e77bcd8 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -34,6 +34,7 @@ void registerPassManagerTestPass(); void registerPatternsTestPass(); void registerSimpleParametricTilingPass(); void registerSymbolTestPasses(); +void registerTestAffineDataCopyPass(); void registerTestAllReduceLoweringPass(); void registerTestCallGraphPass(); void registerTestConstantFold(); @@ -85,6 +86,7 @@ void registerTestPasses() { registerPatternsTestPass(); registerSimpleParametricTilingPass(); registerSymbolTestPasses(); + registerTestAffineDataCopyPass(); registerTestAllReduceLoweringPass(); registerTestCallGraphPass(); registerTestConstantFold();